Setting The Environment¶

Packages required for installation

In [1]:
# Installing Libraries
!pip install teradatasql
!pip install teradata
!pip install pyodbc
Requirement already satisfied: teradatasql in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (17.10.0.2)
Requirement already satisfied: pycryptodome in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (from teradatasql) (3.11.0)
Requirement already satisfied: teradata in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (15.10.0.21)
Requirement already satisfied: pyodbc in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (4.0.0-unsupported)
In [2]:
# Importing Libraries
import teradatasql
import teradata
import pandas as pd
import getpass
import pyodbc
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import numpy as np 
from numpy import mean
from numpy import std
import statsmodels.api as sm
import itertools
import uszipcode
from uszipcode import SearchEngine
from geopy.geocoders import Nominatim
from gcmap import GCMapper, Gradient
from statsmodels.tsa.seasonal import seasonal_decompose
from pylab import rcParams
from matplotlib.colors import Normalize, LinearSegmentedColormap, PowerNorm
import plotly.graph_objects as go
import geopandas as gpd
from scipy import stats 
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from scipy.stats import skew
from scipy.stats.mstats import winsorize
from scipy.stats import zscore

Teradata Connection

In [3]:
# Connection to teradata with credentials
user = "704135"
password = getpass.getpass()
server = "edtdp.aa.com"
In [4]:
# Defining the connection
con = teradatasql.connect(host=server, user=user, password=password, logmech='LDAP')

Data Pull

In [5]:
# Data pull SQL statement
query = """ SELECT


t1.PARTY_ID, /*unique customer id*/
t1.LYLTY_ACCT_ID, /*loyalty program customer id*/
CASE WHEN t1.LYLTY_ACCT_ID IS NOT NULL THEN 1 ELSE 0 END AS loyalty_Ind,
t1.LYLTY_LEVEL_CD, /*loyalty program category - loyalty customers only*/
t3.MACRO_SEG_ID,
t3.INDVDL_AGE_QTY,
t3.GENDER_CD,
CASE WHEN t3.POSTAL_CD='' THEN null ELSE t3.POSTAL_CD END AS POSTAL_CD,
t3.BKGS_QTY,
t3.AADVAN_MBR_TTL_YEARS_QTY,
t3.AA_OPER_REVNUE_SEG_AMT,
t3.AA_OPER_REVNUE_SEG_QTY,
t3.ACRU_AEM_QTY,
t3.ACRU_FLIGHT_AEM_QTY,
t3.ACRU_FLIGHT_SEG_QTY,
t3.BARCLAYS_CARD_HOLDER_IND,
t3.BKGS_INTL_QTY,
COALESCE(CEILING(t3.BKGS_PAX_QTY_AVG),1) AS BKGS_PAX_QTY_AVG,
CEILING(t3.CHECK_BAGS_PER_TRIP_AVG) AS CHECK_BAGS_PER_TRIP_AVG,
t3.CITIBNK_PREMIM_CARD_HLDR_IND,
t3.FLOWN_SEG_QTY,
t3.LOUNGE_MBR_PRTCPNT_IND,
t3.TRIP_QTY,


CASE
WHEN t3.AA_OPER_REVNUE_SEG_AMT>= 2500 THEN 'High Value'
WHEN t3.AA_OPER_REVNUE_SEG_AMT < 2500 AND t3.BKGS_QTY>1 THEN 'Medium Value'
WHEN t3.AA_OPER_REVNUE_SEG_AMT = 0 OR t3.BKGS_QTY<=1 THEN 'Value'
ELSE 'No Valuation' END AS aa_value_code,


EXTRACT(MONTH FROM t1.TICKET_ISSUE_DT) AS booking_month,
t1.PNR_LOCTR_ID, /*PNR number*/
t1.PNR_CREATE_DT, /*PNR created date*/
t1.TICKET_NBR, /*ticket number*/
t1.TICKET_ISSUE_DT, /*ticket issued date*/
t1.SEG_DEP_DT, /*segment departure date*/
t1.SEG_DEP_DT-t1.TICKET_ISSUE_DT AS days_before_dept,
TD_DAY_OF_WEEK(t1.SEG_DEP_DT) AS day_of_week_dep,
EXTRACT(MONTH FROM t1.SEG_DEP_DT) AS dep_month,
t1.SEG_DEP_AIRPRT_IATA_CD,
t1.SEG_ARVL_AIRPRT_IATA_CD,
t1.MKT_N_DIRECTN_AIRPRT_PAIR_CD, /*pair of locations*/
CASE WHEN t1.DEP_CNTRY_CD IN ('US','CA','GB','MX') THEN t1.DEP_CNTRY_CD ELSE 'Other' END AS DEP_CNTRY_CD, /*dept country*/
CASE WHEN t1.ARVL_CNTRY_CD IN ('US','CA','GB','MX') THEN t1.ARVL_CNTRY_CD ELSE 'Other' END AS ARVL_CNTRY_CD, /*arriv country*/
CASE
WHEN t1.DEP_CNTRY_CD NOT IN 'US' THEN 'INTL'
WHEN t1.SEG_DEP_AIRPRT_IATA_CD IN ('AEX','BHM','CHA','CLE','DAY','FSD','JAN','MCI','MSN','TYS') THEN 'FOCUS'
WHEN t1.SEG_DEP_AIRPRT_IATA_CD IN ('CLT','DFW','EYW','FLL','MIA','PBI','PHL','PHX') THEN 'MAJOR HUB' ----> Major HUB
WHEN t1.SEG_DEP_AIRPRT_IATA_CD IN ('LAX','JFK','ORD') THEN 'MAJOR HUB MULTP' ---> Major HUB for multiple airlines
WHEN t1.SEG_DEP_AIRPRT_IATA_CD IN ('ATL','DEN','DTW','EWR','HOU','MSP','OAK','SBA','SEA','SFO','SJC','SJO','SLC') THEN 'OTH AIRL HUB' ---> Major HUB for another airline
ELSE 'SPOKE' END AS AIRPRT_IATA_GRP,
CASE
WHEN t1.DEP_CNTRY_CD NOT IN 'US' THEN 'INTL'
WHEN t1.SEG_DEP_AIRPRT_IATA_CD IN ('AEX','BHM','CHA','CLE','DAY','FSD','JAN','MCI','MSN','TYS') THEN 'FOCUS'
WHEN t1.SEG_DEP_AIRPRT_IATA_CD IN ('CLT','DFW','EYW','FLL','MIA','PBI','PHL','PHX') THEN t1.SEG_DEP_AIRPRT_IATA_CD ----> Major HUB
WHEN t1.SEG_DEP_AIRPRT_IATA_CD IN ('LAX','JFK','ORD') THEN t1.SEG_DEP_AIRPRT_IATA_CD ---> Major HUB for multiple airlines
WHEN t1.SEG_DEP_AIRPRT_IATA_CD IN ('ATL','DEN','DTW','EWR','HOU','MSP','OAK','SBA','SEA','SFO','SJC','SJO','SLC') THEN t1.SEG_DEP_AIRPRT_IATA_CD ---> Major HUB for another airline
ELSE 'SPOKE' END AS AIRPRT_IATA_TOP,
t1.TACTL_ENTITY, /*do we need this*/
t1.SEG_INTRVL_TYPE, /*segment interval type*/
t1.MILE_GREAT_CIRCLE_DISTANC_QTY,
s2.SEG_GEOG_AREA_CD,
s2.SEG_GEOG_AREA_DESC,


t1.CABIN_BOOKED, /*Y - main cabin, W - premium economy, C = business, F = first*/
t1.CABIN_FLOWN, /*Y - main cabin, W - premium economy, C = business, F = first*/
CASE WHEN t1.UPG_TYPE IS NULL THEN 'NONE' ELSE t1.UPG_TYPE END AS UPG_TYPE,
CASE WHEN t1.UPG_TYPE IS NOT NULL THEN 1 ELSE 0 END AS upgrade_ind,
t1.ACCT_FARE_CLASS_CD,
CASE
WHEN t1.ACCT_FARE_CLASS_CD IN ('F','A') THEN 'First Class'
WHEN t1.ACCT_FARE_CLASS_CD IN ('J','R','D','I') THEN 'Business Class'
WHEN t1.ACCT_FARE_CLASS_CD IN ('W','P') THEN 'Premium Economy'
WHEN t1.ACCT_FARE_CLASS_CD IN ('B') THEN 'Basic Economy' ELSE 'Economy' END AS ACCT_FARE_CLASS_TYPE,
t1.TKT_TYPE, /*ind awd ticket R revenue , A awd, convert or pmpute*/
t1.BX_IND,
CASE WHEN t2.BKG_BUSINES_LEISR_IND IS NULL THEN 'L' ELSE t2.BKG_BUSINES_LEISR_IND END AS BKG_BUSINES_LEISR_IND,
CASE WHEN t1.FLIGHT_REV + COALESCE(s2.ancll_rev_ttl,0) >0 THEN 1 ELSE 0 END AS Spend_Ind,
t1.FLIGHT_REV, /*customer can use awr miles, customer segment level, prorated*/
CASE WHEN s2.ancll_rev_ttl >= 1 THEN 1 ELSE 0 END AS ancillary_spend_ind,
COALESCE(s2.ancll_rev_ttl,0) AS ancll_rev_ttl,
COALESCE(s2.ancll_qty_ttl,0) AS ancll_qty_ttl,
COALESCE(s2.rev_pref_seats,0) AS rev_pref_seats,
COALESCE(s2.qty_pref_seats,0) AS qty_pref_seats,
COALESCE(s2.rev_bag,0) AS rev_bag,
COALESCE(s2.qty_bag,0) AS qty_bag,
COALESCE(s2.rev_bag_non_cat,0) AS rev_bag_non_cat,
COALESCE(s2.qty_bag_non_cat,0) AS qty_bag_non_cat,
COALESCE(s2.rev_mileage_multiplier,0) AS rev_mileage_multiplier,
COALESCE(s2.qty_mileage_multiplier,0) AS qty_mileage_multiplier,
COALESCE(s2.rev_tsa,0) AS rev_tsa,
COALESCE(s2.qty_tsa,0) AS qty_tsa,
COALESCE(s2.rev_upgrade,0) AS rev_upgrade,
COALESCE(s2.qty_upgrade,0) AS qty_upgrade,
COALESCE(s2.ancll_rev_ttl,0)-COALESCE(s2.rev_pref_seats,0)-COALESCE(s2.rev_bag,0)-COALESCE(s2.rev_bag_non_cat,0)-COALESCE(s2.rev_mileage_multiplier,0)-COALESCE(s2.rev_tsa,0)-COALESCE(s2.rev_upgrade,0) AS rev_other_non_cat,
COALESCE(s2.ancll_qty_ttl,0)-COALESCE(s2.qty_pref_seats,0)-COALESCE(s2.qty_bag,0)-COALESCE(s2.qty_bag_non_cat,0)-COALESCE(s2.qty_mileage_multiplier,0)-COALESCE(s2.qty_tsa,0)-COALESCE(s2.qty_upgrade,0) AS qty_other_non_cat
--COALESCE(s2.rev_other_non_cat,0) + COALESCE(s2.rev_non_cat_other,0) AS rev_other_non_cat,
--COALESCE(s2.qty_other_non_cat,0) + COALESCE(s2.qty_non_cat_other,0) AS qty_other_non_cat,
--COALESCE(s2.rev_other_non_cat,0) + COALESCE(s2.rev_non_cat_other,0) AS rev_other_non_cat,
--COALESCE(s2.qty_other_non_cat,0) + COALESCE(s2.qty_non_cat_other,0) AS qty_other_non_cat,


FROM PROD_LYLTY_TRANS_VW.CUST_FLIGHT_ACTVTY_TRANS t1


LEFT JOIN
----------------------------AGGREGATE TABLE FOR MERCH AS S2------------------------------------------------
(SELECT
s1.party_id,
s1.ticket_nbr,
s1.ticket_issue_dt,
s1.SEG_ARVL_AIRPRT_IATA_CD,
s1.SEG_DEP_AIRPRT_IATA_CD,
s1.SEG_GEOG_AREA_CD,
s1.SEG_GEOG_AREA_DESC,
SUM(s1.ANCLRY_PROD_PRICE_USD_AMT) AS ancll_rev_ttl,
SUM(s1.ANCLRY_PROD_SLS_QTY) AS ancll_qty_ttl,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'SA' THEN s1.ANCLRY_PROD_PRICE_USD_AMT ELSE 0 END) AS rev_pref_seats,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'SA' THEN s1.ANCLRY_PROD_SLS_QTY ELSE 0 END) AS qty_pref_seats,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'BG' THEN s1.ANCLRY_PROD_PRICE_USD_AMT ELSE 0 END) AS rev_bag,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'BG' THEN s1.ANCLRY_PROD_SLS_QTY ELSE 0 END) AS qty_bag,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = '0' AND s1.ANCLRY_PROD_COMERCL_NM LIKE ANY ('UP100%','UP50%','UP70%') THEN s1.ANCLRY_PROD_PRICE_USD_AMT ELSE 0 END) AS rev_bag_non_cat,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = '0' AND s1.ANCLRY_PROD_COMERCL_NM LIKE ANY ('UP100%','UP50%','UP70%') THEN s1.ANCLRY_PROD_SLS_QTY ELSE 0 END) AS qty_bag_non_cat,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = '0' AND s1.ANCLRY_PROD_COMERCL_NM NOT LIKE ALL ('UP100%','UP50%','UP70%') THEN s1.ANCLRY_PROD_PRICE_USD_AMT ELSE 0 END) AS rev_other_non_cat,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = '0' AND s1.ANCLRY_PROD_COMERCL_NM NOT LIKE ALL ('UP100%','UP50%','UP70%') THEN s1.ANCLRY_PROD_SLS_QTY ELSE 0 END) AS qty_other_non_cat,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'FF' THEN s1.ANCLRY_PROD_PRICE_USD_AMT ELSE 0 END) AS rev_mileage_multiplier,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'FF' THEN s1.ANCLRY_PROD_SLS_QTY ELSE 0 END) AS qty_mileage_multiplier,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'TS' THEN s1.ANCLRY_PROD_PRICE_USD_AMT ELSE 0 END) AS rev_tsa,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'TS' THEN s1.ANCLRY_PROD_SLS_QTY ELSE 0 END) AS qty_tsa,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'UP' THEN s1.ANCLRY_PROD_PRICE_USD_AMT ELSE 0 END) AS rev_upgrade,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'UP' THEN s1.ANCLRY_PROD_SLS_QTY ELSE 0 END) AS qty_upgrade,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = '99' THEN s1.ANCLRY_PROD_PRICE_USD_AMT ELSE 0 END) AS rev_non_cat_other,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = '99' THEN s1.ANCLRY_PROD_SLS_QTY ELSE 0 END) AS qty_non_cat_other
FROM


----------------------------MERCH TABLES UNION AS S1--------------------------------


(SELECT


t1.ANCLRY_PROD_COMERCL_NM, /*group by product type*/
t1.ANCLRY_PROD_PRICE_USD_AMT, /*ancly price*/
t1.ANCLRY_PROD_SLS_QTY, /*quantity*/
--t1.ANCLRY_SLS_AIRLN_ACCT_CD, /*all fields are AA - remove field and add as filter*/
--t1.ANCLRY_SLS_CHANL_CD, /*channel - we can remove this field?*/
--t1.ANCLRY_SLS_ISSUE_DT, /*change to days before departure*/
--t1.DOMNT_AIRLN_IATA_CD, /*dominant airline code - mostly aa - can we drop this too or filter to AA*/
t1.LYLTY_ACCT_ID, /*lylty account id*/
t1.PARTY_ID, /*party id*/
t1.PNR_CREATE_DT, /*pnr create dt*/
t1.PNR_LOCTR_ID, /*pnr lctr id*/
--t1.REV_ACCT_PAYMNT_TYPE_CD, /*payment type - can we drop this as almost all is cc*/
t1.SEG_ARVL_AIRPRT_IATA_CD, /*arrival city*/
t1.SEG_DEP_AIRPRT_IATA_CD, /*dept city*/
t1.SEG_DEP_DT, /*dept date - all null*/
t1.SEG_GEOG_AREA_CD, /*geo code*/
t1.SEG_GEOG_AREA_DESC, /*geo desc*/
t1.TCN_NBR, /*transaction number - check if its needed*/
t1.TICKET_ISSUE_DT, /*ticket issue dt*/
t1.TICKET_NBR, /*ticket number*/
t1.TKT_CART_NBR, /*mostly null*/
--t1.TKT_MKT_AIRLN_IATA_CD, /*should we limit to AA?*/
CAST (000 AS FLOAT) AS ANCLRY_SLS_AMT_CONV_USD,
CAST(000 AS VARCHAR(20)) AS PRAS_DOC_STATUS_DESC,
CAST(000 AS VARCHAR(20)) AS ANCLRY_PROD_GROUP_CD
--CAST(000 AS VARCHAR(20)) AS SPCL_SRVC_CD
FROM PROD_MERCH_VW.MERCH_ANCLRY_NON_CTLG_SLS t1
--WHERE 1=1
--AND t1.ANCLRY_SLS_AIRLN_ACCT_CD = '001'
--AND COALESCE(t1.ANCLRY_PRD_PRICE_LCL_CRNCY_CD,0) = 'USD'
--AND t1.PARTY_ID = '159011984'
--AND t1.ticket_nbr = '2386605942'
--AND t1.PNR_LOCTR_ID IS NOT NULL
--AND t1.SEG_DEP_DT BETWEEN CURRENT_DATE-360 AND CURRENT_DATE


UNION ALL


SELECT


t2.ANCLRY_PROD_COMERCL_NM, /*product name*/
t2.ANCLRY_PROD_PRICE_USD_AMT, /*ancly price*/
t2.ANCLRY_PROD_SLS_QTY, /*quantity*/
--t2.ANCLRY_SLS_AIRLN_ACCT_CD, /*all fields are AA - remove field and add as filter*/
--t2.ANCLRY_SLS_CHANL_CD, /*channel - keep*/
--t2.ANCLRY_SLS_ISSUE_DT, /*change to days before departure*/
--CAST(000 AS VARCHAR(20)) AS DOMNT_AIRLN_IATA_CD, /*drop*/
t2.LYLTY_ACCT_ID, /*start with*/
t2.PARTY_ID, /*lylty and non lylty*/
t2.PNR_CREATE_DT, /*pnr create dt*/
t2.PNR_LOCTR_ID, /*pnr lctr id*/
--t2.REV_ACCT_PAYMNT_TYPE_CD, /*all cc*/
t2.SEG_ARVL_AIRPRT_IATA_CD, /*arrival city*/
t2.SEG_DEP_AIRPRT_IATA_CD, /*dept city*/
t2.SEG_DEP_DT, /*dept date - all null*/
t2.SEG_GEOG_AREA_CD, /*geo code*/
t2.SEG_GEOG_AREA_DESC, /*geo desc*/
t2.TCN_NBR, /*transaction number - check if its needed*/
t2.TICKET_ISSUE_DT, /*ticket issue dt*/
t2.TICKET_NBR, /*ticket number*/
t2.TKT_CART_NBR, /*keep*/
--t2.TKT_MKT_AIRLN_IATA_CD, /*should we limit to AA?*/
t2.ANCLRY_SLS_AMT_CONV_USD, /*sales amount usd*/
t2.PRAS_DOC_STATUS_DESC, /*status*/
t2.ANCLRY_PROD_GROUP_CD /*program code*/
--t2.SPCL_SRVC_CD /*seat or null*/
FROM PROD_MERCH_VW.MERCH_ANCLRY_SLS t2
--WHERE 1=1
--AND t2.PNR_LOCTR_ID IS NOT NULL
--AND t2.ANCLRY_SLS_AIRLN_ACCT_CD = '001'
--AND COALESCE(t2.PRAS_DOC_STATUS_DESC,'OTHER') <> 'REFUNDED'
--AND t2.SEG_DEP_DT BETWEEN CURRENT_DATE-360 AND CURRENT_DATE
--AND COALESCE(t2.ANCLRY_PRD_PRICE_LCL_CRNCY_CD,0) = 'USD'
--AND t2.PARTY_ID = '159011984'
--AND t2.ticket_nbr = '2386605942'
)s1
---------------------------GROUP BY FOR MERCH TABLE S2-----------------------
GROUP BY 1,2,3,4,5,6,7)s2
---------------------------JOIN TO LYLTY TRANSACTION TABLE----------------
ON t1.TICKET_NBR = s2.TICKET_NBR
AND t1.TICKET_ISSUE_DT = s2.TICKET_ISSUE_DT
AND t1.PARTY_ID = s2.PARTY_ID
AND t1.SEG_DEP_AIRPRT_IATA_CD = s2.SEG_DEP_AIRPRT_IATA_CD
AND t1.SEG_ARVL_AIRPRT_IATA_CD = s2.SEG_ARVL_AIRPRT_IATA_CD


LEFT JOIN PROD_PNR_BOOKING_VWS.PNR_BUSINES_LEISR_TAG_ML t2


ON t1.PNR_LOCTR_ID = t2.PNR_LOCTR_ID
AND t1.PNR_CREATE_DT = t2.PNR_CREATE_DT
AND t1.PNR_CREATE_TM = t2.PNR_CREATE_TM


INNER JOIN
(SELECT *
FROM PROD_CUST_CORE_METRICS_VW.INDVDL_CUST_PROFIL WHERE AADVAN_MBR_IND = 'Y' AND ANALYZ_START_DT <= '2020-03-31' QUALIFY ROW_NUMBER() OVER (PARTITION BY PARTY_ID ORDER BY ANALYZ_START_DT DESC)=1) t3
ON t1.PARTY_ID = t3.PARTY_ID


--------------------------- WHERE STATEMENT--------------------------------
WHERE 1=1
AND t1.PNR_LOCTR_ID IS NOT NULL
AND t1.SEG_DEP_DT BETWEEN '2019-01-01' AND '2019-12-31'
AND t1.MKT_AIRLN_IATA_CD = 'AA' /*limit to aa - both marketing, operating, accounting*/
AND t1.OPERAT_AIRLN_IATA_CD = 'AA' /*limit to aa - both marketing, operating, accounting*/
AND t1.ACCT_AIRLN_IATA_CD = 'AA' /*limit to aa - both marketing, operating, accounting*/
AND t1.SEG_INTRVL_TYPE IS NOT NULL
AND t1.COUPON_STATUS_CD = 'USED'
AND t1.LYLTY_ACCT_ID IS NOT NULL
AND t1.TKT_TYPE IN ('A','R')
AND (CASE WHEN t3.POSTAL_CD='' THEN null ELSE t3.POSTAL_CD END) IS NOT null
AND t1.SEG_DEP_DT-t1.TICKET_ISSUE_DT >= 0
AND t3.INDVDL_AGE_QTY IS NOT NULL
SAMPLE 1000;"""
In [6]:
# Reading data from sql to dataframe 
df = pd.read_sql(query, con)

Step 1: Describe the Data¶

In [7]:
# set limits to output
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
In [8]:
# data sample
df.head()
Out[8]:
PARTY_ID LYLTY_ACCT_ID loyalty_Ind LYLTY_LEVEL_CD MACRO_SEG_ID INDVDL_AGE_QTY GENDER_CD POSTAL_CD BKGS_QTY AADVAN_MBR_TTL_YEARS_QTY AA_OPER_REVNUE_SEG_AMT AA_OPER_REVNUE_SEG_QTY ACRU_AEM_QTY ACRU_FLIGHT_AEM_QTY ACRU_FLIGHT_SEG_QTY BARCLAYS_CARD_HOLDER_IND BKGS_INTL_QTY BKGS_PAX_QTY_AVG CHECK_BAGS_PER_TRIP_AVG CITIBNK_PREMIM_CARD_HLDR_IND FLOWN_SEG_QTY LOUNGE_MBR_PRTCPNT_IND TRIP_QTY aa_value_code booking_month PNR_LOCTR_ID PNR_CREATE_DT TICKET_NBR TICKET_ISSUE_DT SEG_DEP_DT days_before_dept day_of_week_dep dep_month SEG_DEP_AIRPRT_IATA_CD SEG_ARVL_AIRPRT_IATA_CD MKT_N_DIRECTN_AIRPRT_PAIR_CD DEP_CNTRY_CD ARVL_CNTRY_CD AIRPRT_IATA_GRP AIRPRT_IATA_TOP TACTL_ENTITY SEG_INTRVL_TYPE MILE_GREAT_CIRCLE_DISTANC_QTY SEG_GEOG_AREA_CD SEG_GEOG_AREA_DESC CABIN_BOOKED CABIN_FLOWN UPG_TYPE upgrade_ind ACCT_FARE_CLASS_CD ACCT_FARE_CLASS_TYPE TKT_TYPE BX_IND BKG_BUSINES_LEISR_IND Spend_Ind FLIGHT_REV ancillary_spend_ind ancll_rev_ttl ancll_qty_ttl rev_pref_seats qty_pref_seats rev_bag qty_bag rev_bag_non_cat qty_bag_non_cat rev_mileage_multiplier qty_mileage_multiplier rev_tsa qty_tsa rev_upgrade qty_upgrade rev_other_non_cat qty_other_non_cat
0 1.481332e+11 0D30MH6 1 G 1 54 M 19701 8.5 23.0 2646.99 17.0 40475.0 19200.5 17.0 Y 1.5 3.0 16.0 N 17.0 N 15.0 High Value 8 QBBXIF 2019-07-10 7.405893e+09 2019-08-30 2019-09-04 5 4 9 PHL SFO PHLSFO US US MAJOR HUB PHL NE-H2H ... D 2521 None None Y Y NONE 0 Q Economy R N B 1 169.30 0 0.00 0 0.00 0 0.0 0 0.0 0 0.0 0 0.0 0 0.0 0 0.0 0
1 5.454613e+09 788EVT2 1 R 3 39 M 75215 12.5 13.0 3279.76 18.5 18461.5 18461.5 19.0 N 2.5 2.0 6.0 N 18.5 N 19.5 High Value 10 AESCRA 2019-10-25 2.385458e+09 2019-10-25 2019-11-22 28 6 11 DFW CLE CLEDFW US US MAJOR HUB DFW MA-VA/WV/PA/OH ... D 1021 D US48 Y Y NONE 0 B Basic Economy R N L 1 133.95 1 37.82 1 37.82 1 0.0 0 0.0 0 0.0 0 0.0 0 0.0 0 0.0 0
2 1.151162e+10 AMY5812 1 P 1 61 M 60120 14.5 32.0 3131.88 16.0 61140.0 25108.0 16.0 N 0.0 2.0 52.0 Y 16.0 N 15.0 High Value 9 MBHGQB 2019-09-06 7.454238e+09 2019-09-06 2019-09-09 3 2 9 CLT GSP CLTGSP US US MAJOR HUB CLT S-SOUTH ... D 75 None None Y Y NONE 0 L Economy R N B 1 94.64 0 0.00 0 0.00 0 0.0 0 0.0 0 0.0 0 0.0 0 0.0 0 0.0 0
3 5.692366e+10 3F7UX02 1 R 4 28 F 02472 5.5 11.0 690.97 6.5 4130.0 4130.0 7.0 N 0.0 2.0 12.0 N 11.5 N 6.5 Medium Value 10 SQDKMS 2019-10-10 2.382890e+09 2019-10-10 2019-12-24 75 3 12 BOS PHL BOSPHL US US SPOKE SPOKE NE-H2H ... D 280 None None Y Y NONE 0 B Basic Economy R N L 1 85.83 0 0.00 0 0.00 0 0.0 0 0.0 0 0.0 0 0.0 0 0.0 0 0.0 0
4 2.362382e+11 27YK4X2 1 G 1 38 F 77077 28.5 4.0 11221.60 59.0 125785.5 112285.5 59.0 N 1.0 2.0 51.0 N 64.0 Y 45.0 High Value 5 XUQCRL 2019-05-20 7.362932e+09 2019-05-21 2019-06-07 17 6 6 PHL IAH IAHPHL US US MAJOR HUB PHL NWC-TEXAS ... D 1324 None None Y Y NONE 0 S Economy R N B 1 136.74 0 0.00 0 0.00 0 0.0 0 0.0 0 0.0 0 0.0 0 0.0 0 0.0 0
In [9]:
#Data Type
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 73 columns):
PARTY_ID                         1000 non-null float64
LYLTY_ACCT_ID                    1000 non-null object
loyalty_Ind                      1000 non-null int64
LYLTY_LEVEL_CD                   1000 non-null object
MACRO_SEG_ID                     1000 non-null int64
INDVDL_AGE_QTY                   1000 non-null int64
GENDER_CD                        1000 non-null object
POSTAL_CD                        1000 non-null object
BKGS_QTY                         1000 non-null float64
AADVAN_MBR_TTL_YEARS_QTY         1000 non-null float64
AA_OPER_REVNUE_SEG_AMT           1000 non-null float64
AA_OPER_REVNUE_SEG_QTY           1000 non-null float64
ACRU_AEM_QTY                     1000 non-null float64
ACRU_FLIGHT_AEM_QTY              1000 non-null float64
ACRU_FLIGHT_SEG_QTY              1000 non-null float64
BARCLAYS_CARD_HOLDER_IND         1000 non-null object
BKGS_INTL_QTY                    1000 non-null float64
BKGS_PAX_QTY_AVG                 1000 non-null float64
CHECK_BAGS_PER_TRIP_AVG          1000 non-null float64
CITIBNK_PREMIM_CARD_HLDR_IND     1000 non-null object
FLOWN_SEG_QTY                    1000 non-null float64
LOUNGE_MBR_PRTCPNT_IND           1000 non-null object
TRIP_QTY                         1000 non-null float64
aa_value_code                    1000 non-null object
booking_month                    1000 non-null int64
PNR_LOCTR_ID                     1000 non-null object
PNR_CREATE_DT                    1000 non-null object
TICKET_NBR                       1000 non-null float64
TICKET_ISSUE_DT                  1000 non-null object
SEG_DEP_DT                       1000 non-null object
days_before_dept                 1000 non-null int64
day_of_week_dep                  1000 non-null int64
dep_month                        1000 non-null int64
SEG_DEP_AIRPRT_IATA_CD           1000 non-null object
SEG_ARVL_AIRPRT_IATA_CD          1000 non-null object
MKT_N_DIRECTN_AIRPRT_PAIR_CD     1000 non-null object
DEP_CNTRY_CD                     1000 non-null object
ARVL_CNTRY_CD                    1000 non-null object
AIRPRT_IATA_GRP                  1000 non-null object
AIRPRT_IATA_TOP                  1000 non-null object
TACTL_ENTITY                     1000 non-null object
SEG_INTRVL_TYPE                  1000 non-null object
MILE_GREAT_CIRCLE_DISTANC_QTY    1000 non-null int64
SEG_GEOG_AREA_CD                 218 non-null object
SEG_GEOG_AREA_DESC               218 non-null object
CABIN_BOOKED                     1000 non-null object
CABIN_FLOWN                      1000 non-null object
UPG_TYPE                         1000 non-null object
upgrade_ind                      1000 non-null int64
ACCT_FARE_CLASS_CD               1000 non-null object
ACCT_FARE_CLASS_TYPE             1000 non-null object
TKT_TYPE                         1000 non-null object
BX_IND                           1000 non-null object
BKG_BUSINES_LEISR_IND            1000 non-null object
Spend_Ind                        1000 non-null int64
FLIGHT_REV                       1000 non-null float64
ancillary_spend_ind              1000 non-null int64
ancll_rev_ttl                    1000 non-null float64
ancll_qty_ttl                    1000 non-null int64
rev_pref_seats                   1000 non-null float64
qty_pref_seats                   1000 non-null int64
rev_bag                          1000 non-null float64
qty_bag                          1000 non-null int64
rev_bag_non_cat                  1000 non-null float64
qty_bag_non_cat                  1000 non-null int64
rev_mileage_multiplier           1000 non-null float64
qty_mileage_multiplier           1000 non-null int64
rev_tsa                          1000 non-null float64
qty_tsa                          1000 non-null int64
rev_upgrade                      1000 non-null float64
qty_upgrade                      1000 non-null int64
rev_other_non_cat                1000 non-null float64
qty_other_non_cat                1000 non-null int64
dtypes: float64(23), int64(19), object(31)
memory usage: 570.4+ KB
In [10]:
# Unique values
df.nunique()
Out[10]:
PARTY_ID                         1000
LYLTY_ACCT_ID                    1000
loyalty_Ind                         1
LYLTY_LEVEL_CD                      6
MACRO_SEG_ID                       11
INDVDL_AGE_QTY                     76
GENDER_CD                           4
POSTAL_CD                         875
BKGS_QTY                           69
AADVAN_MBR_TTL_YEARS_QTY           39
AA_OPER_REVNUE_SEG_AMT            832
AA_OPER_REVNUE_SEG_QTY            103
ACRU_AEM_QTY                      857
ACRU_FLIGHT_AEM_QTY               802
ACRU_FLIGHT_SEG_QTY                68
BARCLAYS_CARD_HOLDER_IND            2
BKGS_INTL_QTY                      23
BKGS_PAX_QTY_AVG                    9
CHECK_BAGS_PER_TRIP_AVG            63
CITIBNK_PREMIM_CARD_HLDR_IND        2
FLOWN_SEG_QTY                     110
LOUNGE_MBR_PRTCPNT_IND              2
TRIP_QTY                           90
aa_value_code                       3
booking_month                      12
PNR_LOCTR_ID                     1000
PNR_CREATE_DT                     373
TICKET_NBR                       1000
TICKET_ISSUE_DT                   376
SEG_DEP_DT                        333
days_before_dept                  160
day_of_week_dep                     7
dep_month                          12
SEG_DEP_AIRPRT_IATA_CD            181
SEG_ARVL_AIRPRT_IATA_CD           156
MKT_N_DIRECTN_AIRPRT_PAIR_CD      510
DEP_CNTRY_CD                        5
ARVL_CNTRY_CD                       5
AIRPRT_IATA_GRP                     6
AIRPRT_IATA_TOP                    25
TACTL_ENTITY                       41
SEG_INTRVL_TYPE                     3
MILE_GREAT_CIRCLE_DISTANC_QTY     455
SEG_GEOG_AREA_CD                   10
SEG_GEOG_AREA_DESC                 10
CABIN_BOOKED                        3
CABIN_FLOWN                         3
UPG_TYPE                            7
upgrade_ind                         2
ACCT_FARE_CLASS_CD                 20
ACCT_FARE_CLASS_TYPE                4
TKT_TYPE                            2
BX_IND                              2
BKG_BUSINES_LEISR_IND               2
Spend_Ind                           2
FLIGHT_REV                        817
ancillary_spend_ind                 2
ancll_rev_ttl                      89
ancll_qty_ttl                       5
rev_pref_seats                     70
qty_pref_seats                      3
rev_bag                             1
qty_bag                             1
rev_bag_non_cat                    12
qty_bag_non_cat                     5
rev_mileage_multiplier              2
qty_mileage_multiplier              2
rev_tsa                             2
qty_tsa                             2
rev_upgrade                         3
qty_upgrade                         2
rev_other_non_cat                   2
qty_other_non_cat                   2
dtype: int64
In [11]:
# Scientific Notation to long format
pd.set_option('display.float_format', lambda x: '%.2f' % x)
In [12]:
# Data Statistics
df.describe().round().T
Out[12]:
count mean std min 25% 50% 75% max
PARTY_ID 1000.00 106372368385.00 93520325819.00 20288584.00 10547375501.00 117226425201.00 170561838801.00 282263008001.00
loyalty_Ind 1000.00 1.00 0.00 1.00 1.00 1.00 1.00 1.00
MACRO_SEG_ID 1000.00 6.00 4.00 1.00 3.00 5.00 10.00 11.00
INDVDL_AGE_QTY 1000.00 49.00 15.00 5.00 37.00 50.00 60.00 91.00
BKGS_QTY 1000.00 5.00 8.00 0.00 0.00 2.00 6.00 72.00
AADVAN_MBR_TTL_YEARS_QTY 1000.00 15.00 10.00 2.00 5.00 12.00 23.00 40.00
AA_OPER_REVNUE_SEG_AMT 1000.00 1757.00 3000.00 0.00 160.00 648.00 2053.00 29922.00
AA_OPER_REVNUE_SEG_QTY 1000.00 9.00 14.00 0.00 1.00 4.00 11.00 122.00
ACRU_AEM_QTY 1000.00 34297.00 63263.00 0.00 1326.00 7224.00 40590.00 553949.00
ACRU_FLIGHT_AEM_QTY 1000.00 15906.00 32566.00 0.00 774.00 3688.00 13632.00 336215.00
ACRU_FLIGHT_SEG_QTY 1000.00 9.00 14.00 0.00 1.00 4.00 11.00 121.00
BKGS_INTL_QTY 1000.00 1.00 3.00 0.00 0.00 0.00 0.00 68.00
BKGS_PAX_QTY_AVG 1000.00 2.00 1.00 1.00 1.00 2.00 2.00 31.00
CHECK_BAGS_PER_TRIP_AVG 1000.00 8.00 14.00 0.00 0.00 3.00 8.00 184.00
FLOWN_SEG_QTY 1000.00 10.00 15.00 0.00 2.00 5.00 13.00 126.00
TRIP_QTY 1000.00 7.00 11.00 0.00 1.00 4.00 9.00 88.00
booking_month 1000.00 6.00 3.00 1.00 3.00 6.00 9.00 12.00
TICKET_NBR 1000.00 4321811223.00 2438655194.00 2173755731.00 2350600725.00 2380559074.00 7315702002.00 9511280750.00
days_before_dept 1000.00 37.00 51.00 0.00 7.00 20.00 46.00 331.00
day_of_week_dep 1000.00 4.00 2.00 1.00 2.00 4.00 6.00 7.00
dep_month 1000.00 7.00 3.00 1.00 4.00 7.00 9.00 12.00
MILE_GREAT_CIRCLE_DISTANC_QTY 1000.00 1078.00 1060.00 67.00 408.00 802.00 1271.00 7488.00
upgrade_ind 1000.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00
Spend_Ind 1000.00 1.00 0.00 0.00 1.00 1.00 1.00 1.00
FLIGHT_REV 1000.00 181.00 222.00 0.00 75.00 134.00 222.00 3122.00
ancillary_spend_ind 1000.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00
ancll_rev_ttl 1000.00 7.00 19.00 0.00 0.00 0.00 0.00 125.00
ancll_qty_ttl 1000.00 0.00 1.00 0.00 0.00 0.00 0.00 5.00
rev_pref_seats 1000.00 3.00 14.00 0.00 0.00 0.00 0.00 118.00
qty_pref_seats 1000.00 0.00 0.00 0.00 0.00 0.00 0.00 2.00
rev_bag 1000.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
qty_bag 1000.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
rev_bag_non_cat 1000.00 3.00 11.00 0.00 0.00 0.00 0.00 125.00
qty_bag_non_cat 1000.00 0.00 0.00 0.00 0.00 0.00 0.00 5.00
rev_mileage_multiplier 1000.00 0.00 2.00 0.00 0.00 0.00 0.00 52.00
qty_mileage_multiplier 1000.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00
rev_tsa 1000.00 0.00 1.00 0.00 0.00 0.00 0.00 35.00
qty_tsa 1000.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00
rev_upgrade 1000.00 0.00 3.00 0.00 0.00 0.00 0.00 86.00
qty_upgrade 1000.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00
rev_other_non_cat 1000.00 0.00 5.00 0.00 0.00 0.00 0.00 75.00
qty_other_non_cat 1000.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00
In [13]:
#statistical description of object values
df.describe(include=['object'])
Out[13]:
LYLTY_ACCT_ID LYLTY_LEVEL_CD GENDER_CD POSTAL_CD BARCLAYS_CARD_HOLDER_IND CITIBNK_PREMIM_CARD_HLDR_IND LOUNGE_MBR_PRTCPNT_IND aa_value_code PNR_LOCTR_ID PNR_CREATE_DT TICKET_ISSUE_DT SEG_DEP_DT SEG_DEP_AIRPRT_IATA_CD SEG_ARVL_AIRPRT_IATA_CD MKT_N_DIRECTN_AIRPRT_PAIR_CD DEP_CNTRY_CD ARVL_CNTRY_CD AIRPRT_IATA_GRP AIRPRT_IATA_TOP TACTL_ENTITY SEG_INTRVL_TYPE SEG_GEOG_AREA_CD SEG_GEOG_AREA_DESC CABIN_BOOKED CABIN_FLOWN UPG_TYPE ACCT_FARE_CLASS_CD ACCT_FARE_CLASS_TYPE TKT_TYPE BX_IND BKG_BUSINES_LEISR_IND
count 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 218 218 1000 1000 1000 1000 1000 1000 1000 1000
unique 1000 6 4 875 2 2 2 3 1000 373 376 333 181 156 510 5 5 6 25 41 3 10 10 3 3 7 20 4 2 2 2
top 0M0XM44 R M 76034 N N N Medium Value KQICJB 2019-02-05 2019-04-08 2019-01-27 DFW DFW DFWORD US US MAJOR HUB SPOKE S-SOUTH ... D D US48 Y Y NONE G Economy R N B
freq 1 619 608 6 877 787 901 425 1 9 8 9 162 166 11 929 944 409 332 80 873 189 189 909 819 867 127 878 903 910 511
In [14]:
# Getting mean null values in each column
df.isnull().sum().sort_values(ascending=True)
Out[14]:
PARTY_ID                           0
BX_IND                             0
TKT_TYPE                           0
ACCT_FARE_CLASS_TYPE               0
ACCT_FARE_CLASS_CD                 0
upgrade_ind                        0
UPG_TYPE                           0
BKG_BUSINES_LEISR_IND              0
CABIN_FLOWN                        0
MILE_GREAT_CIRCLE_DISTANC_QTY      0
SEG_INTRVL_TYPE                    0
TACTL_ENTITY                       0
AIRPRT_IATA_TOP                    0
AIRPRT_IATA_GRP                    0
ARVL_CNTRY_CD                      0
CABIN_BOOKED                       0
rev_other_non_cat                  0
Spend_Ind                          0
ancillary_spend_ind                0
qty_upgrade                        0
rev_upgrade                        0
qty_tsa                            0
rev_tsa                            0
qty_mileage_multiplier             0
rev_mileage_multiplier             0
FLIGHT_REV                         0
qty_bag_non_cat                    0
qty_bag                            0
rev_bag                            0
qty_pref_seats                     0
rev_pref_seats                     0
ancll_qty_ttl                      0
ancll_rev_ttl                      0
rev_bag_non_cat                    0
MKT_N_DIRECTN_AIRPRT_PAIR_CD       0
DEP_CNTRY_CD                       0
SEG_DEP_AIRPRT_IATA_CD             0
BARCLAYS_CARD_HOLDER_IND           0
ACRU_FLIGHT_SEG_QTY                0
ACRU_FLIGHT_AEM_QTY                0
ACRU_AEM_QTY                       0
AA_OPER_REVNUE_SEG_QTY             0
AA_OPER_REVNUE_SEG_AMT             0
AADVAN_MBR_TTL_YEARS_QTY           0
BKGS_QTY                           0
POSTAL_CD                          0
GENDER_CD                          0
INDVDL_AGE_QTY                     0
MACRO_SEG_ID                       0
LYLTY_LEVEL_CD                     0
loyalty_Ind                        0
LYLTY_ACCT_ID                      0
BKGS_INTL_QTY                      0
SEG_ARVL_AIRPRT_IATA_CD            0
BKGS_PAX_QTY_AVG                   0
CITIBNK_PREMIM_CARD_HLDR_IND       0
dep_month                          0
day_of_week_dep                    0
days_before_dept                   0
SEG_DEP_DT                         0
TICKET_ISSUE_DT                    0
TICKET_NBR                         0
CHECK_BAGS_PER_TRIP_AVG            0
qty_other_non_cat                  0
PNR_LOCTR_ID                       0
booking_month                      0
aa_value_code                      0
TRIP_QTY                           0
LOUNGE_MBR_PRTCPNT_IND             0
FLOWN_SEG_QTY                      0
PNR_CREATE_DT                      0
SEG_GEOG_AREA_CD                 782
SEG_GEOG_AREA_DESC               782
dtype: int64
In [15]:
#Data Shape
print('Total Rows',df.shape[0],'Total Features',df.shape[1])
Total Rows 1000 Total Features 73
In [16]:
# Unique Values for Categgorical Columns

print("Loyalty Level Codes:")
print(pd.unique(df['LYLTY_LEVEL_CD']))
print("---------------------")
print("Macro Segment IDs:")
print(pd.unique(df['MACRO_SEG_ID']))
print("---------------------")
print("Gender Codes:")
print(pd.unique(df['GENDER_CD']))
print("---------------------")
print("Segment Types:")
print(pd.unique(df['SEG_INTRVL_TYPE']))
print("---------------------")
print("Cabin Booked:")
print(pd.unique(df['CABIN_BOOKED']))
print("---------------------")
print("Cabin Flown:")
print(pd.unique(df['CABIN_FLOWN']))
print("---------------------")
print("Upgrade Type:")
print(pd.unique(df['UPG_TYPE']))
Loyalty Level Codes:
['G     ' 'R     ' 'P     ' 'T     ' 'E     ' 'C     ']
---------------------
Macro Segment IDs:
[ 1  3  4 10  9 11  2  8  6  5  7]
---------------------
Gender Codes:
['M         ' 'F         ' 'U         ' 'X         ']
---------------------
Segment Types:
['D' 'IL' 'IS']
---------------------
Cabin Booked:
['Y' 'C' 'W']
---------------------
Cabin Flown:
['Y   ' 'C   ' 'W   ']
---------------------
Upgrade Type:
['NONE' 'COMP' 'UPFAR' 'UPAWD' 'E500' 'EVIP' 'LFBU']

Step 2: Clean the Data¶

In [17]:
# Create Target Variable
df['Total_Revenue']= df['FLIGHT_REV'] + df['ancll_rev_ttl']

# Convert to dinamic once we have access to birth date
bins = [0,23, 39, 55, 74, 150]
labels = ['Z','Millennial', 'X', 'Boomers','Silent']
df['age_cat'] = pd.cut(df['INDVDL_AGE_QTY'], bins=bins, labels=labels, right=False)

# Dropped rows that were missing age info
df.dropna(subset = ['INDVDL_AGE_QTY'], inplace=True)

# Replace null values 
df["BKGS_PAX_QTY_AVG"].fillna(1, inplace = True)

# Replace null values 
df['UPG_TYPE'].fillna("NONE", inplace = True)

df.drop(['SEG_GEOG_AREA_CD','SEG_GEOG_AREA_DESC'], axis = 1,inplace = True)

# Replace null values 
df["BKG_BUSINES_LEISR_IND"].fillna("L", inplace = True)

# Dropped rows that were missing age info
df.dropna(subset = ['age_cat'], inplace=True)

# Dropped rows that were missing age info
df.dropna(subset = ['GENDER_CD'], inplace=True)

# Dropped rows that were missing age info
df.dropna(subset = ['FLIGHT_REV'], inplace=True)

# Replace null values 
df["TACTL_ENTITY"].fillna("None", inplace = True)

# Replace null values 
df["POSTAL_CD"].fillna("None", inplace = True)

# Removing Whitespace
df["BKG_BUSINES_LEISR_IND"] = df["BKG_BUSINES_LEISR_IND"].str.strip()
df["SEG_ARVL_AIRPRT_IATA_CD"] = df["SEG_ARVL_AIRPRT_IATA_CD"].str.strip()
df["SEG_DEP_AIRPRT_IATA_CD"] = df["SEG_DEP_AIRPRT_IATA_CD"].str.strip()
df["CABIN_BOOKED"] = df["CABIN_BOOKED"].str.strip()
df["CABIN_FLOWN"] = df["CABIN_FLOWN"].str.strip()
df["DEP_CNTRY_CD"] = df["DEP_CNTRY_CD"].str.strip()
df["ARVL_CNTRY_CD"] = df["ARVL_CNTRY_CD"].str.strip()
df["POSTAL_CD"] = df["POSTAL_CD"].str.strip()
# Convert datetime
df['SEG_DEP_DT'] = pd.to_datetime(df['SEG_DEP_DT'])
df['TICKET_ISSUE_DT'] = pd.to_datetime(df['TICKET_ISSUE_DT'])
df['PNR_CREATE_DT'] = pd.to_datetime(df['PNR_CREATE_DT'])
#convert float to int
df["MACRO_SEG_ID"]=df["MACRO_SEG_ID"].astype(str)
df["dep_month"]=df["dep_month"].astype(str)
df["day_of_week_dep"]=df["day_of_week_dep"].astype(str)
df["PARTY_ID"]=df["PARTY_ID"].astype(int)
df["INDVDL_AGE_QTY"]=df["INDVDL_AGE_QTY"].astype(int)
df["BKGS_PAX_QTY_AVG"]=df["BKGS_PAX_QTY_AVG"].astype(int)
In [18]:
# Feature creation
#Join data with the lat/lon info
df_l=pd.read_csv('All.csv')
df_join=pd.merge(df_l, df, on='SEG_DEP_AIRPRT_IATA_CD', how='inner')
# Creating arrival city and departure city
#Creating new columns to have Coordinates
df_join.loc[:, 'city_arr'] = df_join["SEG_ARVL_AIRPRT_IATA_CD"].copy()
df_join.loc[:, 'city_dep'] = df_join["City"].copy()
#Seperating dfs
df_city = df_join[['SEG_DEP_AIRPRT_IATA_CD', 'City']]
df_city = df_join.set_index('SEG_DEP_AIRPRT_IATA_CD')['City']
# Creating Dictionary 
citi_dict = df_city.T.to_dict()
# Replacing Dictionary with values
df_join['city_arr']=df_join['city_arr'].map(citi_dict)


#Creating new columns to have Coordinates
df_join.loc[:, 'location_lat_dep'] = df_join["SEG_DEP_AIRPRT_IATA_CD"].copy()
df_join.loc[:, 'location_long_dep'] = df_join["SEG_DEP_AIRPRT_IATA_CD"].copy()
df_join.loc[:, 'location_lat_arr'] = df_join["SEG_ARVL_AIRPRT_IATA_CD"].copy()
df_join.loc[:, 'location_long_arr'] =df_join["SEG_ARVL_AIRPRT_IATA_CD"].copy()
#Seperating lat and long dfs
df_lat = df_l[['SEG_DEP_AIRPRT_IATA_CD', 'Lat']]
df_lat = df_lat.set_index('SEG_DEP_AIRPRT_IATA_CD')['Lat']
df_lon = df_l[['SEG_DEP_AIRPRT_IATA_CD', 'lon']]
df_lon = df_lon.set_index('SEG_DEP_AIRPRT_IATA_CD')['lon']
# Creating Dictionary 
lat_dict = df_lat.T.to_dict()
lon_dict = df_lon.T.to_dict()
# Replacing Dictionary with values
df_join['location_lat_dep']=df_join['location_lat_dep'].map(lat_dict)
df_join['location_long_dep']=df_join['location_long_dep'].map(lon_dict)
df_join['location_lat_arr']=df_join['location_lat_arr'].map(lat_dict)
df_join['location_long_arr']=df_join['location_long_arr'].map(lon_dict)
df=df_join
In [19]:
#dataframe
#International Flights
overseas_df = df[(df["DEP_CNTRY_CD"] != df["ARVL_CNTRY_CD"])]
#print("International Travel Dataset Size After Filtering : ", overseas_df.shape)
#Domestic Flights
US_df = df[df["DEP_CNTRY_CD"] == df["ARVL_CNTRY_CD"]]
#print("International Travel Dataset Size After Filtering : ", US_df.shape)
#Flight column indicating international or domestic flight
df.loc[(df["DEP_CNTRY_CD"] != df["ARVL_CNTRY_CD"]), 'Flight'] = 'International'
df.loc[df["DEP_CNTRY_CD"] == df["ARVL_CNTRY_CD"], 'Flight'] = 'Domestic'
In [20]:
# Data Statistics on Age 
df['INDVDL_AGE_QTY'].describe()
Out[20]:
count   1000.00
mean      48.80
std       15.00
min        5.00
25%       37.00
50%       50.00
75%       60.00
max       91.00
Name: INDVDL_AGE_QTY, dtype: float64
In [21]:
# Data Statistics on Age Category
df['age_cat'].value_counts()
Out[21]:
Boomers       342
X             331
Millennial    249
Silent         42
Z              36
Name: age_cat, dtype: int64
In [22]:
# Getting mean null values in each column
df.isnull().sum().sort_values(ascending=True)
Out[22]:
Unnamed: 0                        0
ancillary_spend_ind               0
FLIGHT_REV                        0
Spend_Ind                         0
BKG_BUSINES_LEISR_IND             0
BX_IND                            0
TKT_TYPE                          0
ACCT_FARE_CLASS_TYPE              0
ACCT_FARE_CLASS_CD                0
upgrade_ind                       0
UPG_TYPE                          0
CABIN_FLOWN                       0
CABIN_BOOKED                      0
MILE_GREAT_CIRCLE_DISTANC_QTY     0
SEG_INTRVL_TYPE                   0
TACTL_ENTITY                      0
AIRPRT_IATA_TOP                   0
AIRPRT_IATA_GRP                   0
ancll_rev_ttl                     0
ancll_qty_ttl                     0
rev_pref_seats                    0
qty_pref_seats                    0
location_lat_arr                  0
location_long_dep                 0
location_lat_dep                  0
city_dep                          0
age_cat                           0
Total_Revenue                     0
qty_other_non_cat                 0
rev_other_non_cat                 0
ARVL_CNTRY_CD                     0
qty_upgrade                       0
qty_tsa                           0
rev_tsa                           0
qty_mileage_multiplier            0
rev_mileage_multiplier            0
qty_bag_non_cat                   0
rev_bag_non_cat                   0
qty_bag                           0
rev_bag                           0
rev_upgrade                       0
location_long_arr                 0
DEP_CNTRY_CD                      0
SEG_ARVL_AIRPRT_IATA_CD           0
AA_OPER_REVNUE_SEG_AMT            0
AADVAN_MBR_TTL_YEARS_QTY          0
BKGS_QTY                          0
POSTAL_CD                         0
GENDER_CD                         0
INDVDL_AGE_QTY                    0
MACRO_SEG_ID                      0
LYLTY_LEVEL_CD                    0
loyalty_Ind                       0
LYLTY_ACCT_ID                     0
PARTY_ID                          0
lon                               0
Lat                               0
SEG_DEP_AIRPRT_IATA_CD            0
Country                           0
City                              0
Airport Name                      0
AA_OPER_REVNUE_SEG_QTY            0
MKT_N_DIRECTN_AIRPRT_PAIR_CD      0
ACRU_AEM_QTY                      0
ACRU_FLIGHT_SEG_QTY               0
dep_month                         0
day_of_week_dep                   0
days_before_dept                  0
SEG_DEP_DT                        0
TICKET_ISSUE_DT                   0
TICKET_NBR                        0
PNR_CREATE_DT                     0
PNR_LOCTR_ID                      0
ACRU_FLIGHT_AEM_QTY               0
booking_month                     0
TRIP_QTY                          0
LOUNGE_MBR_PRTCPNT_IND            0
FLOWN_SEG_QTY                     0
CITIBNK_PREMIM_CARD_HLDR_IND      0
CHECK_BAGS_PER_TRIP_AVG           0
BKGS_PAX_QTY_AVG                  0
BKGS_INTL_QTY                     0
BARCLAYS_CARD_HOLDER_IND          0
aa_value_code                     0
Flight                            0
city_arr                         74
dtype: int64

Step 3: Outlier Detection & Data Normalization¶

In [23]:
# Removing all warnings
import warnings
warnings.filterwarnings("ignore")
In [24]:
#Setting graph color themes
sns.set_palette(sns.color_palette("Set3", 10))
In [25]:
# histogram of total revenue distribution
ax = sns.distplot(df.Total_Revenue, hist=True, hist_kws={"edgecolor": 'w', "linewidth": 3}, kde_kws={"linewidth": 3}, bins=[0, 50, 100, 150, 200, 250, 300, 350, 400, 450])
# ticks 
plt.xticks(fontsize=9)
plt.yticks(fontsize=9)
# labels and title
plt.xlabel('Total_Revenue', fontsize=9)
plt.ylabel('frequency', fontsize=9)
plt.title('Distribution of Revenue', fontsize=18);
In [26]:
#creating sub dataframe where there is an spend indication 
df_nonnullrev = df.loc[df['Spend_Ind'] == 1].copy()
In [27]:
# histogram of total revenue distribution when there is spend
ax = sns.distplot(df_nonnullrev.Total_Revenue, hist=True, hist_kws={"edgecolor": 'w', "linewidth": 3}, kde_kws={"linewidth": 3}, bins=[0, 50, 100, 150, 200, 250, 300, 350, 400, 450])
# ticks 
plt.xticks(fontsize=9)
plt.yticks(fontsize=9)
# labels and title
plt.xlabel('Total_Revenue', fontsize=9)
plt.ylabel('frequency', fontsize=9)
plt.title('Distribution of Revenue', fontsize=18);
In [28]:
# box plot of the variable Total_Revenue
ax = sns.boxplot(df.Total_Revenue)
# xtick, label, and title
plt.xticks(fontsize=9)
plt.xlabel('Total Revenue', fontsize=9)
plt.title('Distribution of Total Revenue', fontsize=18)
Out[28]:
Text(0.5, 1.0, 'Distribution of Total Revenue')
In [29]:
# box plot of the variable Total_Revenue when there is spend
ax = sns.boxplot(df_nonnullrev.Total_Revenue)
# xtick, label, and title
plt.xticks(fontsize=9)
plt.xlabel('Total Revenue', fontsize=9)
plt.title('Distribution of Total Revenue', fontsize=18)
Out[29]:
Text(0.5, 1.0, 'Distribution of Total Revenue')
In [30]:
# Boxplot for numerical features and their distrbution
study_set = ['Total_Revenue','FLIGHT_REV','ancll_rev_ttl','BKGS_QTY','AA_OPER_REVNUE_SEG_AMT',
             'AA_OPER_REVNUE_SEG_QTY','ACRU_AEM_QTY','ACRU_FLIGHT_AEM_QTY','ACRU_FLIGHT_SEG_QTY',
             'CHECK_BAGS_PER_TRIP_AVG','FLOWN_SEG_QTY','TRIP_QTY']
plt.figure(figsize=(20,35))
sns.set_palette(sns.color_palette("Set3", 10))
sns.set_theme(style="dark")
sns.set_palette('Set3')
for i, variable in enumerate(study_set):
        plt.subplot(8,5,i+1)
        sns.boxplot(y= df[variable], data=df)     
        plt.tight_layout()
        plt.title(variable)
In [31]:
# Barplot for numerical features and their distrbution
study_set = ['Total_Revenue','FLIGHT_REV','ancll_rev_ttl','BKGS_QTY','AA_OPER_REVNUE_SEG_AMT',
             'AA_OPER_REVNUE_SEG_QTY','ACRU_AEM_QTY','ACRU_FLIGHT_AEM_QTY','ACRU_FLIGHT_SEG_QTY',
             'CHECK_BAGS_PER_TRIP_AVG','FLOWN_SEG_QTY','TRIP_QTY']
plt.figure(figsize=(20,35))
sns.set_palette(sns.color_palette("Set3", 10))
sns.set_theme(style="dark")
sns.set_palette('Set3')
for i, variable in enumerate(study_set):
        plt.subplot(8,5,i+1)
        sns.distplot(df[variable])     
        plt.tight_layout()
        plt.title(variable)
In [32]:
#creating new dataframe for log analysis on numerical values
df_log = df.copy()
#Adding one to all numerical values for analysing data greater than 0
df_log['log_Total_Revenue'] = np.log(df_log['Total_Revenue']+1) 
df_log['log_FLIGHT_REV'] = np.log(df_log['FLIGHT_REV']+1) 
df_log['log_ancll_rev_ttl'] = np.log(df_log['ancll_rev_ttl']+1) 
df_log['log_BKGS_QTY'] = np.log(df_log['BKGS_QTY']+1) 
df_log['log_AA_OPER_REVNUE_SEG_AMT'] = np.log(df_log['AA_OPER_REVNUE_SEG_AMT']+1) 
df_log['log_AA_OPER_REVNUE_SEG_QTY'] = np.log(df_log['AA_OPER_REVNUE_SEG_QTY']+1) 
df_log['log_ACRU_AEM_QTY'] = np.log(df_log['BKGS_QTY']+1) 
df_log['log_ACRU_FLIGHT_AEM_QTY'] = np.log(df_log['AA_OPER_REVNUE_SEG_AMT']+1) 
df_log['log_ACRU_FLIGHT_SEG_QTY'] = np.log(df_log['AA_OPER_REVNUE_SEG_QTY']+1) 
df_log['log_CHECK_BAGS_PER_TRIP_AVG'] = np.log(df_log['CHECK_BAGS_PER_TRIP_AVG']+1) 
df_log['log_FLOWN_SEG_QTY'] = np.log(df_log['FLOWN_SEG_QTY']+1) 
df_log['log_TRIP_QTY'] = np.log(df_log['TRIP_QTY']+1) 
In [33]:
# Baxplot for log numerical features and their distrbution 
log_study_set = ['log_Total_Revenue','log_FLIGHT_REV','log_ancll_rev_ttl','log_BKGS_QTY','log_AA_OPER_REVNUE_SEG_AMT',
             'log_AA_OPER_REVNUE_SEG_QTY','log_ACRU_AEM_QTY','log_ACRU_FLIGHT_AEM_QTY','log_ACRU_FLIGHT_SEG_QTY',
            'log_CHECK_BAGS_PER_TRIP_AVG','log_FLOWN_SEG_QTY','log_TRIP_QTY']
plt.figure(figsize=(20,35))
sns.set_palette(sns.color_palette("Set3", 10))
sns.set_theme(style="dark")
sns.set_palette('Set3')
for i, variable in enumerate(log_study_set):
        plt.subplot(8,5,i+1)
        sns.boxplot(y= df_log[variable], data=df_log)     
        plt.tight_layout()
        plt.title(variable)
In [34]:
# lineplot for numerical features and their distrbution
log_study_set = ['log_Total_Revenue','log_FLIGHT_REV','log_ancll_rev_ttl','log_BKGS_QTY','log_AA_OPER_REVNUE_SEG_AMT',
             'log_AA_OPER_REVNUE_SEG_QTY','log_ACRU_AEM_QTY','log_ACRU_FLIGHT_AEM_QTY','log_ACRU_FLIGHT_SEG_QTY',
            'log_CHECK_BAGS_PER_TRIP_AVG','log_FLOWN_SEG_QTY','log_TRIP_QTY']
plt.figure(figsize=(20,35))
sns.set_palette(sns.color_palette("Set3", 10))
sns.set_theme(style="dark")
sns.set_palette('Set3')
a= np.arange(0, 4, 40, dtype=None)
for i, variable in enumerate(log_study_set):
        plt.subplot(8,5,i+1)
        sns.distplot(df_log[variable],bins=a)     
        plt.tight_layout()
        plt.title(variable)
In [35]:
# measuring skewness of numeric values
numericColumns = ['Total_Revenue','FLIGHT_REV','ancll_rev_ttl','BKGS_QTY','AA_OPER_REVNUE_SEG_AMT',
             'AA_OPER_REVNUE_SEG_QTY','ACRU_AEM_QTY','ACRU_FLIGHT_AEM_QTY','ACRU_FLIGHT_SEG_QTY',
             'CHECK_BAGS_PER_TRIP_AVG','FLOWN_SEG_QTY','TRIP_QTY']
skewColumns = df[numericColumns].apply(lambda x: skew(x)).sort_values(ascending=False)
In [36]:
#Listing high skew (greater than 0.5)
highSkew = skewColumns[skewColumns > 0.5]
print(highSkew)
FLIGHT_REV                6.30
Total_Revenue             6.16
CHECK_BAGS_PER_TRIP_AVG   4.97
ACRU_FLIGHT_AEM_QTY       4.16
AA_OPER_REVNUE_SEG_AMT    3.82
AA_OPER_REVNUE_SEG_QTY    3.57
ACRU_FLIGHT_SEG_QTY       3.52
ACRU_AEM_QTY              3.51
BKGS_QTY                  3.45
FLOWN_SEG_QTY             3.38
ancll_rev_ttl             3.22
TRIP_QTY                  3.18
dtype: float64
In [37]:
# HighSkew index
highSkew.index
Out[37]:
Index(['FLIGHT_REV', 'Total_Revenue', 'CHECK_BAGS_PER_TRIP_AVG',
       'ACRU_FLIGHT_AEM_QTY', 'AA_OPER_REVNUE_SEG_AMT',
       'AA_OPER_REVNUE_SEG_QTY', 'ACRU_FLIGHT_SEG_QTY', 'ACRU_AEM_QTY',
       'BKGS_QTY', 'FLOWN_SEG_QTY', 'ancll_rev_ttl', 'TRIP_QTY'],
      dtype='object')
In [38]:
# Creating new dataframe for boxcox visualization
df_boxcox = df.copy()
In [39]:
#Creating a boxcox variable for all variables with high skewness
for i in highSkew.index:
    df_boxcox[f"boxcox_{i}"] = boxcox1p(df_boxcox[i],boxcox_normmax(df_boxcox[i] + 1))
In [40]:
# show total revenue distrbution
f= plt.figure(figsize=(12,4))
sns.distplot(df['Total_Revenue'],bins=[0,100,200,300,400,500,600,700,800],color='r')
ax.set_title('Distribution of Total Revenue')
Out[40]:
Text(0.5, 1.0, 'Distribution of Total Revenue')
In [41]:
# total revenue distrbution on log total revenue
f= plt.figure(figsize=(12,4))
sns.distplot(df_log['log_Total_Revenue'],bins=[0,2,4,6,8,10,12,14,16,18,20],color='r')
ax.set_title('Distribution of log scale Total Revenue')
Out[41]:
Text(0.5, 1.0, 'Distribution of log scale Total Revenue')
In [42]:
# distribution of total revenue box cox transformed
f= plt.figure(figsize=(12,4))
sns.distplot(df_boxcox['Total_Revenue'],bins=[0,100,200,300,400,500,600,700],color='r')
ax.set_title('Distribution of Total Revenue boxcox transformed')
Out[42]:
Text(0.5, 1.0, 'Distribution of Total Revenue boxcox transformed')
In [43]:
#boxplot on boxcox dataframe for numerical data
boxcox_study_set = ['Total_Revenue','FLIGHT_REV','boxcox_ancll_rev_ttl','boxcox_BKGS_QTY','boxcox_AA_OPER_REVNUE_SEG_AMT',
             'boxcox_AA_OPER_REVNUE_SEG_QTY','boxcox_ACRU_AEM_QTY','boxcox_ACRU_FLIGHT_AEM_QTY','boxcox_ACRU_FLIGHT_SEG_QTY',
            'boxcox_CHECK_BAGS_PER_TRIP_AVG','boxcox_FLOWN_SEG_QTY','boxcox_TRIP_QTY']
plt.figure(figsize=(20,35))
sns.set_palette(sns.color_palette("Set3", 10))
sns.set_theme(style="dark")
sns.set_palette('Set3')
for i, variable in enumerate(boxcox_study_set):
        plt.subplot(8,5,i+1)
        sns.boxplot(y= df_boxcox[variable], data=df_boxcox)     
        plt.tight_layout()
        plt.title(variable)
In [44]:
#lineplot on boxcox dataframe for numerical data
boxcox_study_set = ['Total_Revenue','FLIGHT_REV','boxcox_ancll_rev_ttl','boxcox_BKGS_QTY','boxcox_AA_OPER_REVNUE_SEG_AMT',
             'boxcox_AA_OPER_REVNUE_SEG_QTY','boxcox_ACRU_AEM_QTY','boxcox_ACRU_FLIGHT_AEM_QTY','boxcox_ACRU_FLIGHT_SEG_QTY',
            'boxcox_CHECK_BAGS_PER_TRIP_AVG','boxcox_FLOWN_SEG_QTY','boxcox_TRIP_QTY']
plt.figure(figsize=(20,35))
sns.set_palette(sns.color_palette("Set3", 10))
sns.set_theme(style="dark")
sns.set_palette('Set3')
a= np.arange(0, 4, 100, dtype=None)
for i, variable in enumerate(boxcox_study_set):
        plt.subplot(8,5,i+1)
        sns.distplot(df_boxcox[variable],bins=a)     
        plt.tight_layout()
        plt.title(variable)
In [45]:
#Histogram plot for total revenue on different bins
plt.hist(df['Total_Revenue'].dropna().values, bins=[0,50,100,150,200,250,300,350,400,450,500,550,600])
plt.show()
In [46]:
# Histogram plot for log total revenue on different bins
plt.hist(df_log['log_Total_Revenue'].dropna().values,bins=[0,1,2,3,4,5,6,7,8,9,10])
plt.show()
In [47]:
# Histogram plot for boxcox total revenue on different bins
plt.hist(df_boxcox['Total_Revenue'].dropna().values,bins=[0,50,100,150,200,250,300,350,400,450,500,550,600])
plt.show()
In [48]:
#df size of row and columns
df.shape[0],df.shape[1]
Out[48]:
(1000, 86)
In [49]:
# Creating a dataframe for winsorization 
df_win = df.copy()
In [50]:
# Apply one-way winsorization to the highest end of the value distribution. Because of extreme values, set the values of the outliers to the value of the 90th percentile. 
df_win['win_Total_Revenue'] = winsorize(df_win['Total_Revenue'], (0, 0.1))
df_win['win_FLIGHT_REV'] = winsorize(df_win['FLIGHT_REV'], (0, 0.1))
df_win['win_ancll_rev_ttl'] = winsorize(df_win['ancll_rev_ttl'], (0, 0.1))
df_win['win_BKGS_QTY'] = winsorize(df_win['BKGS_QTY'], (0, 0.1))
df_win['win_AA_OPER_REVNUE_SEG_AMT'] = winsorize(df_win['AA_OPER_REVNUE_SEG_AMT'], (0, 0.1))
df_win['win_AA_OPER_REVNUE_SEG_QTY'] = winsorize(df_win['AA_OPER_REVNUE_SEG_QTY'], (0, 0.1))
df_win['win_ACRU_AEM_QTY'] = winsorize(df_win['ACRU_AEM_QTY'], (0, 0.1))
df_win['win_ACRU_FLIGHT_AEM_QTY'] = winsorize(df_win['ACRU_FLIGHT_AEM_QTY'], (0, 0.1))
df_win['win_ACRU_FLIGHT_SEG_QTY'] = winsorize(df_win['ACRU_FLIGHT_SEG_QTY'], (0, 0.1))
df_win['win_CHECK_BAGS_PER_TRIP_AVG'] = winsorize(df_win['CHECK_BAGS_PER_TRIP_AVG'], (0, 0.1))
df_win['win_FLOWN_SEG_QTY'] = winsorize(df_win['FLOWN_SEG_QTY'], (0, 0.1))
df_win['win_TRIP_QTY'] = winsorize(df_win['TRIP_QTY'], (0, 0.1))
In [51]:
#Boxplot on winsorized data set
win_study_set = ['win_Total_Revenue','win_FLIGHT_REV','win_ancll_rev_ttl','win_BKGS_QTY','win_AA_OPER_REVNUE_SEG_AMT',
             'win_AA_OPER_REVNUE_SEG_QTY','win_ACRU_AEM_QTY','win_ACRU_FLIGHT_AEM_QTY','win_ACRU_FLIGHT_SEG_QTY','win_CHECK_BAGS_PER_TRIP_AVG','win_FLOWN_SEG_QTY','win_TRIP_QTY']
plt.figure(figsize=(20,35))
sns.set_palette(sns.color_palette("Set3", 10))
sns.set_theme(style="dark")
sns.set_palette('Set3')
for i, variable in enumerate(win_study_set):
        plt.subplot(8,5,i+1)
        sns.boxplot(y= df_win[variable], data=df_win)     
        plt.tight_layout()
        plt.title(variable)
In [52]:
#lineplot on winsorized dataset
win_study_set = ['win_Total_Revenue','win_FLIGHT_REV','win_ancll_rev_ttl','win_BKGS_QTY','win_AA_OPER_REVNUE_SEG_AMT',
             'win_AA_OPER_REVNUE_SEG_QTY','win_ACRU_AEM_QTY','win_ACRU_FLIGHT_AEM_QTY','win_ACRU_FLIGHT_SEG_QTY','win_CHECK_BAGS_PER_TRIP_AVG','win_FLOWN_SEG_QTY','win_TRIP_QTY']
plt.figure(figsize=(20,35))
sns.set_palette(sns.color_palette("Set3", 10))
sns.set_theme(style="dark")
sns.set_palette('Set3')
for i, variable in enumerate(win_study_set):
        plt.subplot(8,5,i+1)
        sns.distplot(df_win[variable])     
        plt.tight_layout()
        plt.title(variable)
In [53]:
# z score on winsorized data and number of outliers
z_scores = zscore(df_win['win_Total_Revenue'])
for threshold in range(2,5):
    print("The score threshold is: {}".format(threshold))
    print("The indices of the outliers:")
    print(np.where(z_scores > threshold))
    print("Number of outliers is: {}".format(len((np.where(z_scores > threshold)[0]))))
The score threshold is: 3
The indices of the outliers:
(array([], dtype=int64),)
Number of outliers is: 0
The score threshold is: 4
The indices of the outliers:
(array([], dtype=int64),)
Number of outliers is: 0

Step 4: Univariate & Bivariate Analysis¶

Create subset dataframes:

In [54]:
df_subset_spend = df[df['Spend_Ind']==1]
In [55]:
df_subset_ancll_spend = df[df['ancillary_spend_ind']==1]
In [56]:
df.groupby(['LYLTY_LEVEL_CD'])['Total_Revenue'].describe().T
Out[56]:
LYLTY_LEVEL_CD C E G P R T
count 5.00 81.00 162.00 100.00 619.00 33.00
mean 356.00 275.25 205.49 233.49 163.87 189.77
std 226.44 324.97 278.23 354.59 150.86 131.29
min 121.26 0.00 0.00 0.00 0.00 0.00
25% 199.83 133.98 90.05 88.14 73.63 75.22
50% 279.84 190.13 149.31 159.39 128.40 191.63
75% 516.28 279.10 227.97 271.38 207.46 276.43
max 662.79 2299.57 2845.80 3121.78 1300.00 497.99
In [57]:
df_subset_spend.groupby(['LYLTY_LEVEL_CD'])['Total_Revenue'].describe().T
Out[57]:
LYLTY_LEVEL_CD C E G P R T
count 5.00 77.00 147.00 93.00 560.00 30.00
mean 356.00 289.55 226.46 251.07 181.13 208.75
std 226.44 327.06 283.88 361.72 148.42 122.15
min 121.26 61.40 14.88 46.85 17.67 44.24
25% 199.83 146.77 107.38 97.67 90.41 91.54
50% 279.84 197.07 163.59 173.02 145.35 210.69
75% 516.28 296.00 238.65 272.56 221.19 297.36
max 662.79 2299.57 2845.80 3121.78 1300.00 497.99
In [58]:
df_subset_ancll_spend.groupby(['LYLTY_LEVEL_CD'])['Total_Revenue'].describe().T
Out[58]:
LYLTY_LEVEL_CD G R
count 16.00 159.00
mean 229.75 201.63
std 88.33 135.03
min 103.10 24.38
25% 172.08 116.16
50% 216.28 169.07
75% 277.28 252.69
max 425.52 922.55
In [59]:
df.groupby(['LYLTY_LEVEL_CD'])['FLIGHT_REV'].describe().T
Out[59]:
LYLTY_LEVEL_CD C E G P R T
count 5.00 81.00 162.00 100.00 619.00 33.00
mean 356.00 275.25 200.63 233.49 153.27 189.77
std 226.44 324.97 278.09 354.59 146.66 131.29
min 121.26 0.00 0.00 0.00 0.00 0.00
25% 199.83 133.98 86.75 88.14 67.59 75.22
50% 279.84 190.13 139.97 159.39 120.95 191.63
75% 516.28 279.10 225.13 271.38 195.01 276.43
max 662.79 2299.57 2845.80 3121.78 1300.00 497.99
In [60]:
df_subset_spend.groupby(['LYLTY_LEVEL_CD'])['FLIGHT_REV'].describe().T
Out[60]:
LYLTY_LEVEL_CD C E G P R T
count 5.00 77.00 147.00 93.00 560.00 30.00
mean 356.00 289.55 221.10 251.07 169.42 208.75
std 226.44 327.06 284.11 361.72 145.04 122.15
min 121.26 61.40 14.88 46.85 0.00 44.24
25% 199.83 146.77 105.50 97.67 80.76 91.54
50% 279.84 197.07 149.77 173.02 131.58 210.69
75% 516.28 296.00 230.49 272.56 209.14 297.36
max 662.79 2299.57 2845.80 3121.78 1300.00 497.99
In [61]:
df_subset_ancll_spend.groupby(['LYLTY_LEVEL_CD'])['FLIGHT_REV'].describe().T
Out[61]:
LYLTY_LEVEL_CD G R
count 16.00 159.00
mean 180.52 160.39
std 84.48 123.30
min 76.28 0.00
25% 120.23 81.40
50% 144.66 134.40
75% 236.21 211.02
max 373.95 815.68
In [62]:
df.groupby(['LYLTY_LEVEL_CD'])['ancll_rev_ttl'].describe().T
Out[62]:
LYLTY_LEVEL_CD C E G P R T
count 5.00 81.00 162.00 100.00 619.00 33.00
mean 0.00 0.00 4.86 0.00 10.59 0.00
std 0.00 0.00 16.07 0.00 22.15 0.00
min 0.00 0.00 0.00 0.00 0.00 0.00
25% 0.00 0.00 0.00 0.00 0.00 0.00
50% 0.00 0.00 0.00 0.00 0.00 0.00
75% 0.00 0.00 0.00 0.00 10.38 0.00
max 0.00 0.00 109.30 0.00 125.00 0.00
In [63]:
df_subset_spend.groupby(['LYLTY_LEVEL_CD'])['ancll_rev_ttl'].describe().T
Out[63]:
LYLTY_LEVEL_CD C E G P R T
count 5.00 77.00 147.00 93.00 560.00 30.00
mean 0.00 0.00 5.36 0.00 11.71 0.00
std 0.00 0.00 16.80 0.00 23.00 0.00
min 0.00 0.00 0.00 0.00 0.00 0.00
25% 0.00 0.00 0.00 0.00 0.00 0.00
50% 0.00 0.00 0.00 0.00 0.00 0.00
75% 0.00 0.00 0.00 0.00 24.54 0.00
max 0.00 0.00 109.30 0.00 125.00 0.00
In [64]:
df_subset_ancll_spend.groupby(['LYLTY_LEVEL_CD'])['ancll_rev_ttl'].describe().T
Out[64]:
LYLTY_LEVEL_CD G R
count 16.00 159.00
mean 49.23 41.24
std 21.02 25.43
min 26.82 6.63
25% 34.75 30.00
50% 40.00 30.00
75% 59.93 42.47
max 109.30 125.00
In [65]:
df.groupby(['age_cat'])['INDVDL_AGE_QTY'].describe().T
Out[65]:
age_cat Z Millennial X Boomers Silent
count 36.00 249.00 331.00 342.00 42.00
mean 16.94 32.00 47.46 62.11 77.81
std 4.93 4.38 4.52 4.98 4.01
min 5.00 23.00 39.00 55.00 74.00
25% 16.00 28.00 44.00 58.00 75.00
50% 19.00 32.00 48.00 62.00 77.00
75% 20.25 36.00 52.00 66.00 78.00
max 22.00 38.00 54.00 73.00 91.00
In [66]:
df_subset_spend.groupby(['age_cat'])['INDVDL_AGE_QTY'].describe().T
Out[66]:
age_cat Z Millennial X Boomers Silent
count 31.00 232.00 311.00 304.00 34.00
mean 16.52 32.02 47.46 61.96 78.15
std 5.11 4.34 4.54 4.97 4.35
min 5.00 23.00 39.00 55.00 74.00
25% 14.50 28.00 44.00 58.00 75.00
50% 19.00 32.00 48.00 61.00 77.00
75% 20.00 36.00 52.00 66.00 79.00
max 22.00 38.00 54.00 73.00 91.00
In [67]:
df_subset_ancll_spend.groupby(['age_cat'])['INDVDL_AGE_QTY'].describe().T
Out[67]:
age_cat Z Millennial X Boomers Silent
count 7.00 45.00 57.00 56.00 10.00
mean 16.00 31.80 47.82 62.25 81.00
std 6.68 3.97 4.52 5.25 5.25
min 5.00 24.00 39.00 55.00 74.00
25% 12.50 29.00 45.00 58.00 78.25
50% 20.00 32.00 48.00 62.00 79.50
75% 20.50 35.00 52.00 66.00 81.75
max 21.00 38.00 54.00 73.00 91.00
In [68]:
df.groupby(['age_cat'])['Total_Revenue'].describe().T
Out[68]:
age_cat Z Millennial X Boomers Silent
count 36.00 249.00 331.00 342.00 42.00
mean 142.77 170.25 217.48 178.85 183.93
std 116.71 159.77 294.59 183.85 227.24
min 0.00 0.00 0.00 0.00 0.00
25% 68.23 73.37 99.45 79.95 57.01
50% 138.20 130.50 163.59 136.14 121.12
75% 174.18 208.37 244.50 227.28 239.73
max 532.51 1226.28 3121.78 1534.95 1300.00
In [69]:
df_subset_spend.groupby(['age_cat'])['Total_Revenue'].describe().T
Out[69]:
age_cat Z Millennial X Boomers Silent
count 31.00 232.00 311.00 304.00 34.00
mean 165.80 182.72 231.47 201.21 227.21
std 109.29 158.47 298.55 183.10 232.44
min 25.00 14.88 23.48 17.67 32.00
25% 100.40 85.03 108.83 94.62 80.13
50% 153.32 149.24 169.52 148.75 171.76
75% 181.52 225.36 254.00 238.70 295.87
max 532.51 1226.28 3121.78 1534.95 1300.00
In [70]:
df_subset_ancll_spend.groupby(['age_cat'])['Total_Revenue'].describe().T
Out[70]:
age_cat Z Millennial X Boomers Silent
count 7.00 45.00 57.00 56.00 10.00
mean 126.09 183.66 224.42 210.34 201.69
std 65.23 106.06 153.28 134.88 97.32
min 25.00 30.00 30.00 24.38 74.68
25% 95.90 111.86 137.88 121.77 117.69
50% 112.79 168.21 176.98 175.59 195.46
75% 164.85 199.30 270.27 275.35 295.87
max 223.33 565.49 922.55 742.47 326.92
In [71]:
df.groupby(['age_cat'])['FLIGHT_REV'].describe().T
Out[71]:
age_cat Z Millennial X Boomers Silent
count 36.00 249.00 331.00 342.00 42.00
mean 136.28 163.88 210.37 171.26 170.15
std 117.79 159.36 293.57 181.72 228.00
min 0.00 0.00 0.00 0.00 0.00
25% 62.80 68.12 92.45 73.07 45.58
50% 129.37 126.51 152.80 127.43 90.50
75% 167.97 199.83 234.22 216.62 222.13
max 532.51 1226.28 3121.78 1534.95 1300.00
In [72]:
df_subset_spend.groupby(['age_cat'])['FLIGHT_REV'].describe().T
Out[72]:
age_cat Z Millennial X Boomers Silent
count 31.00 232.00 311.00 304.00 34.00
mean 158.26 175.89 223.90 192.67 210.18
std 112.22 158.57 297.84 181.73 236.47
min 0.00 0.00 0.00 0.00 0.00
25% 79.90 77.15 107.38 90.45 74.92
50% 153.32 135.57 161.64 143.72 157.20
75% 171.30 214.65 244.50 228.61 240.93
max 532.51 1226.28 3121.78 1534.95 1300.00
In [73]:
df_subset_ancll_spend.groupby(['age_cat'])['FLIGHT_REV'].describe().T
Out[73]:
age_cat Z Millennial X Boomers Silent
count 7.00 45.00 57.00 56.00 10.00
mean 92.70 148.44 183.13 164.01 143.80
std 58.27 102.23 138.31 120.48 102.43
min 0.00 0.00 0.00 0.00 0.00
25% 69.30 74.94 107.88 82.80 57.27
50% 82.79 134.40 147.91 134.56 138.09
75% 128.97 169.30 233.10 219.76 225.73
max 169.57 480.00 815.68 624.91 296.92
In [74]:
df.groupby(['age_cat'])['ancll_rev_ttl'].describe().T
Out[74]:
age_cat Z Millennial X Boomers Silent
count 36.00 249.00 331.00 342.00 42.00
mean 6.49 6.36 7.11 7.59 13.78
std 14.39 16.03 18.76 20.45 28.09
min 0.00 0.00 0.00 0.00 0.00
25% 0.00 0.00 0.00 0.00 0.00
50% 0.00 0.00 0.00 0.00 0.00
75% 0.00 0.00 0.00 0.00 0.00
max 53.76 100.00 125.00 117.56 100.00
In [75]:
df_subset_spend.groupby(['age_cat'])['ancll_rev_ttl'].describe().T
Out[75]:
age_cat Z Millennial X Boomers Silent
count 31.00 232.00 311.00 304.00 34.00
mean 7.54 6.83 7.57 8.53 17.03
std 15.27 16.51 19.27 21.50 30.38
min 0.00 0.00 0.00 0.00 0.00
25% 0.00 0.00 0.00 0.00 0.00
50% 0.00 0.00 0.00 0.00 0.00
75% 0.00 0.00 0.00 0.00 30.00
max 53.76 100.00 125.00 117.56 100.00
In [76]:
df_subset_ancll_spend.groupby(['age_cat'])['ancll_rev_ttl'].describe().T
Out[76]:
age_cat Z Millennial X Boomers Silent
count 7.00 45.00 57.00 56.00 10.00
mean 33.39 35.22 41.29 46.33 57.89
std 12.63 20.22 25.25 27.65 27.51
min 18.91 8.11 11.08 6.63 30.00
25% 25.00 30.00 30.00 30.00 31.09
50% 30.00 30.00 30.20 30.00 56.94
75% 40.53 30.00 40.00 65.23 80.21
max 53.76 100.00 125.00 117.56 100.00
In [77]:
#Barplot frequency on each features 
categories= ['GENDER_CD','age_cat','aa_value_code','SEG_INTRVL_TYPE','CABIN_BOOKED','CABIN_FLOWN','BX_IND','BKG_BUSINES_LEISR_IND','LYLTY_LEVEL_CD','MACRO_SEG_ID',
'BARCLAYS_CARD_HOLDER_IND','CITIBNK_PREMIM_CARD_HLDR_IND','LOUNGE_MBR_PRTCPNT_IND','day_of_week_dep','dep_month','upgrade_ind','ACCT_FARE_CLASS_TYPE','TKT_TYPE','ancillary_spend_ind','DEP_CNTRY_CD','ARVL_CNTRY_CD','UPG_TYPE']
title=['GENDER_CD','age_cat','aa_value_code','SEG_INTRVL_TYPE','CABIN_BOOKED','CABIN_FLOWN','BX_IND','BKG_BUSINES_LEISR_IND','LYLTY_LEVEL_CD','MACRO_SEG_ID',
'BARCLAYS_CARD_HOLDER_IND','CITIBNK_PREMIM_CARD_HLDR_IND','LOUNGE_MBR_PRTCPNT_IND','day_of_week_dep','dep_month','upgrade_ind','ACCT_FARE_CLASS_TYPE','TKT_TYPE','ancillary_spend_ind','DEP_CNTRY_CD','ARVL_CNTRY_CD','UPG_TYPE']

plt.figure(figsize=(30 ,50))
sns.set(font_scale = 3)
sns.set_theme(style="dark")

for i, variable in enumerate(categories):
                     plt.subplot(9,3,i+1)  
                     sns.set_palette('coolwarm')
                     ax=sns.countplot(x=df[variable], data=df, order = df[variable].value_counts().index)
                     for p in ax.patches:
                           percentage = '{:.1f}%'.format(100 * p.get_height()/len(df[variable]))
                           x = p.get_x() + p.get_width()
                           y = p.get_y() + p.get_height()
                           plt.annotate(percentage, (x, y),ha='center', fontsize = 16)
                        
                     plt.tight_layout()
                     plt.title(title[i],fontsize = 16)                       
In [78]:
# Percentage of 'SEG_DEP_AIRPRT_IATA_CD' over 'SEG_DEP_AIRPRT_IATA_CD'
df['SEG_DEP_AIRPRT_IATA_CD'].value_counts().nlargest(10)/df['SEG_DEP_AIRPRT_IATA_CD'].count()*100
Out[78]:
DFW   16.20
CLT    9.90
ORD    7.30
MIA    4.80
PHL    4.70
PHX    4.10
DCA    3.70
LAX    3.10
LGA    1.60
BOS    1.60
Name: SEG_DEP_AIRPRT_IATA_CD, dtype: float64
In [79]:
# Grouping 'SEG_DEP_AIRPRT_IATA_CD' and 'SEG_DEP_AIRPRT_IATA_CD' with their counts 
df.groupby(['SEG_DEP_AIRPRT_IATA_CD']).SEG_DEP_AIRPRT_IATA_CD.value_counts().nlargest(10)
Out[79]:
SEG_DEP_AIRPRT_IATA_CD  SEG_DEP_AIRPRT_IATA_CD
DFW                     DFW                       162
CLT                     CLT                        99
ORD                     ORD                        73
MIA                     MIA                        48
PHL                     PHL                        47
PHX                     PHX                        41
DCA                     DCA                        37
LAX                     LAX                        31
BOS                     BOS                        16
LGA                     LGA                        16
Name: SEG_DEP_AIRPRT_IATA_CD, dtype: int64
In [80]:
# Percentage of 'SEG_DEP_AIRPRT_IATA_CD' over 'SEG_DEP_AIRPRT_IATA_CD' where there is  ancll spend
df_subset_ancll_spend['SEG_DEP_AIRPRT_IATA_CD'].value_counts().nlargest(10)/df_subset_ancll_spend['SEG_DEP_AIRPRT_IATA_CD'].count()*100
Out[80]:
DFW   13.71
ORD    5.71
MIA    5.14
PHX    5.14
DCA    4.57
CLT    4.57
EWR    2.86
LAX    2.86
DEN    2.29
PHL    2.29
Name: SEG_DEP_AIRPRT_IATA_CD, dtype: float64
In [81]:
# Percentage of 'SEG_ARVL_AIRPRT_IATA_CD' over 'SEG_DEP_AIRPRT_IATA_CD' where there is  ancll spend
df['SEG_ARVL_AIRPRT_IATA_CD'].value_counts().nlargest(10)/df['SEG_DEP_AIRPRT_IATA_CD'].count()*100
Out[81]:
DFW   16.60
CLT   11.20
ORD    6.90
MIA    5.40
PHX    5.20
PHL    5.20
LAX    4.70
DCA    2.80
LGA    1.80
BOS    1.40
Name: SEG_ARVL_AIRPRT_IATA_CD, dtype: float64
In [82]:
# Grouping 'SEG_ARVL_AIRPRT_IATA_CD' and 'SEG_ARVL_AIRPRT_IATA_CD' with their counts 
df.groupby(['SEG_ARVL_AIRPRT_IATA_CD']).SEG_ARVL_AIRPRT_IATA_CD.value_counts().nlargest(10)
Out[82]:
SEG_ARVL_AIRPRT_IATA_CD  SEG_ARVL_AIRPRT_IATA_CD
DFW                      DFW                        166
CLT                      CLT                        112
ORD                      ORD                         69
MIA                      MIA                         54
PHL                      PHL                         52
PHX                      PHX                         52
LAX                      LAX                         47
DCA                      DCA                         28
LGA                      LGA                         18
BOS                      BOS                         14
Name: SEG_ARVL_AIRPRT_IATA_CD, dtype: int64
In [83]:
# Percentage of 'MKT_N_DIRECTN_AIRPRT_PAIR_CD' over 'MKT_N_DIRECTN_AIRPRT_PAIR_CD' where there is  ancll spend
df['MKT_N_DIRECTN_AIRPRT_PAIR_CD'].value_counts().nlargest(10)/df['MKT_N_DIRECTN_AIRPRT_PAIR_CD'].count()*100
Out[83]:
ORDPHL   1.10
DFWORD   1.10
DCADFW   1.00
DFWLAX   0.90
DFWLGA   0.90
LAXORD   0.90
DFWSAN   0.90
CLTDFW   0.80
PHLPHX   0.80
DFWLAS   0.80
Name: MKT_N_DIRECTN_AIRPRT_PAIR_CD, dtype: float64
In [84]:
# Percentage of 'MKT_N_DIRECTN_AIRPRT_PAIR_CD' over 'MKT_N_DIRECTN_AIRPRT_PAIR_CD' where there is  ancll spend
df_subset_ancll_spend['MKT_N_DIRECTN_AIRPRT_PAIR_CD'].value_counts().nlargest(10)/df_subset_ancll_spend['MKT_N_DIRECTN_AIRPRT_PAIR_CD'].count()*100
Out[84]:
DFWLAX   1.71
BNADFW   1.71
DFWSNA   1.71
DCADFW   1.14
CLTMIA   1.14
CLTDFW   1.14
DSMPHX   1.14
DFWORD   1.14
DFWMCI   1.14
DENDFW   1.14
Name: MKT_N_DIRECTN_AIRPRT_PAIR_CD, dtype: float64
In [85]:
# Grouping 'MKT_N_DIRECTN_AIRPRT_PAIR_CD' and 'MKT_N_DIRECTN_AIRPRT_PAIR_CD' with their counts 
df.groupby(['MKT_N_DIRECTN_AIRPRT_PAIR_CD']).MKT_N_DIRECTN_AIRPRT_PAIR_CD.value_counts().nlargest(10)
Out[85]:
MKT_N_DIRECTN_AIRPRT_PAIR_CD  MKT_N_DIRECTN_AIRPRT_PAIR_CD
DFWORD                        DFWORD                          11
ORDPHL                        ORDPHL                          11
DCADFW                        DCADFW                          10
DFWLAX                        DFWLAX                           9
DFWLGA                        DFWLGA                           9
DFWSAN                        DFWSAN                           9
LAXORD                        LAXORD                           9
CLTDFW                        CLTDFW                           8
DFWLAS                        DFWLAS                           8
PHLPHX                        PHLPHX                           8
Name: MKT_N_DIRECTN_AIRPRT_PAIR_CD, dtype: int64
In [86]:
#Count of 'DEP_CNTRY_CD' over 'DEP_CNTRY_CD'
df['DEP_CNTRY_CD'].value_counts()/df['DEP_CNTRY_CD'].count()
Out[86]:
US      0.93
Other   0.05
MX      0.01
CA      0.01
GB      0.00
Name: DEP_CNTRY_CD, dtype: float64
In [87]:
#Count of 'DEP_CNTRY_CD' over 'DEP_CNTRY_CD' where there is ancll spend
df_subset_ancll_spend['DEP_CNTRY_CD'].value_counts()/df_subset_ancll_spend['DEP_CNTRY_CD'].count()
Out[87]:
US      0.91
Other   0.06
MX      0.02
CA      0.01
GB      0.01
Name: DEP_CNTRY_CD, dtype: float64
In [88]:
#Count of 'ARVL_CNTRY_CD' over 'ARVL_CNTRY_CD'
df['ARVL_CNTRY_CD'].value_counts()/df['ARVL_CNTRY_CD'].count()
Out[88]:
US      0.94
Other   0.04
GB      0.01
MX      0.01
CA      0.00
Name: ARVL_CNTRY_CD, dtype: float64
In [89]:
#Count of 'TACTL_ENTITY' over 'TACTL_ENTITY'
df['TACTL_ENTITY'].value_counts()/df['TACTL_ENTITY'].count()
Out[89]:
S-SOUTH                                              0.08
MA-VA/WV/PA/OH                                       0.07
NE-H2H                                               0.07
NWC-TEXAS                                            0.06
MA-H2H                                               0.05
SW-S CALIFORNIA                                      0.04
S-FLORIDA                                            0.04
MA-CAROLINAS                                         0.04
MW-MIDWEST SOUTH                                     0.04
MW-MIDWEST NORTH                                     0.04
SW-N CALIFORNIA                                      0.04
SW-SW SMALL CITIES                                   0.04
NWC-CENTRAL/MOUNTAIN                                 0.03
NE-NEW YORK                                          0.03
MIA-H2H                                              0.03
MIA-HUB                                              0.03
SW-LAS                                               0.03
MW-H2H                                               0.03
CRB - PLS                                            0.02
NW/WEST CANADA                                       0.02
NE-NORTHEAST SPOKES                                  0.02
S-ORLANDO                                            0.01
C.EUROPE                                             0.01
UK                                                   0.01
S-H2H                                                0.01
SW-HAWAII                                            0.01
C.AMERICA                                            0.01
MEX-PLE                                              0.01
MEX-BUS                                              0.01
NWC-CABIN/SKI                                        0.01
SW-H2H                                               0.01
CRB - VFR                                            0.01
NE-EAST CANADA                                       0.01
CRB - PR/VI                                          0.01
MA-DC AREA                                           0.01
PACIFIC                                              0.01
S.AMERICA                                            0.01
N.RIM                                                0.01
SPAIN                                                0.01
BRAZIL                                               0.00
AUSTRALIA/NZ                                         0.00
Name: TACTL_ENTITY, dtype: float64
In [90]:
# Grouping 'MKT_N_DIRECTN_AIRPRT_PAIR_CD' and 'MKT_N_DIRECTN_AIRPRT_PAIR_CD' with their counts 
df.groupby(['TACTL_ENTITY']).TACTL_ENTITY.value_counts().nlargest(10)
Out[90]:
TACTL_ENTITY                                        TACTL_ENTITY                                      
S-SOUTH                                             S-SOUTH                                               80
MA-VA/WV/PA/OH                                      MA-VA/WV/PA/OH                                        68
NE-H2H                                              NE-H2H                                                67
NWC-TEXAS                                           NWC-TEXAS                                             55
MA-H2H                                              MA-H2H                                                49
S-FLORIDA                                           S-FLORIDA                                             45
SW-S CALIFORNIA                                     SW-S CALIFORNIA                                       45
MA-CAROLINAS                                        MA-CAROLINAS                                          44
MW-MIDWEST NORTH                                    MW-MIDWEST NORTH                                      38
MW-MIDWEST SOUTH                                    MW-MIDWEST SOUTH                                      38
Name: TACTL_ENTITY, dtype: int64
In [91]:
x=df.groupby(['Spend_Ind'])['Spend_Ind'].count()
y=len(df)
percentage=((x/y)).round(2)

ratio = pd.DataFrame(percentage).T
In [92]:
# Spend distrbution
fig, ax = plt.subplots(1,1,figsize=(6, 3))

ax.barh(ratio.index, ratio[1], color='#1f77b4', alpha=1, label='Spend')
ax.barh(ratio.index, ratio[0], left=ratio[1], color='#d62728', alpha=1, label='No Spend')

ax.set_xlim(0, 1)
ax.set_xticks([])
ax.set_yticks([])

for i in ratio.index:
    
    ax.annotate(f"{int(ratio[1][i]*100)}%", xy=(ratio[1][i]/2, i),va = 'center', ha='center',fontsize=20, fontweight='light', fontfamily='serif',color='black')
    ax.annotate("Spend",  xy=(ratio[1][i]/2, -0.25),va = 'center', ha='center',fontsize=20, fontweight='light', fontfamily='serif',color='black')
    
for i in ratio.index:
    ax.annotate(f"{int(ratio[0][i]*100)}%",xy=(ratio[1][i]+ratio[0][i]/2, i),va = 'center', ha='center',fontsize=20, fontweight='light', fontfamily='serif',color='black')
    ax.annotate("No Spend", xy=(ratio[1][i]+ratio[0][i]/2, -0.25),va = 'center', ha='center',fontsize=20, fontweight='light', fontfamily='serif',color='black')

fig.text(0.125,1.00,'Spend Distribution', fontfamily='serif',fontsize=15, fontweight='bold')
fig.text(0.125,.90,'92% of all transactions had spend greater than $0',fontfamily='serif',fontsize=12)  
plt.show()
In [93]:
x=df_subset_spend.groupby(['ancillary_spend_ind'])['ancillary_spend_ind'].count()
y=len(df_subset_spend)
percentage=((x/y)).round(2)

ratio = pd.DataFrame(percentage).T
In [94]:
# Ancillary spend distrbution
fig, ax = plt.subplots(1,1,figsize=(6, 3))

ax.barh(ratio.index, ratio[1], color='#1f77b4', alpha=1, label='Spend')
ax.barh(ratio.index, ratio[0], left=ratio[1], color='#d62728', alpha=1, label='No Spend')

ax.set_xlim(0, 1)
ax.set_xticks([])
ax.set_yticks([])

for i in ratio.index:
    
    ax.annotate(f"{int(ratio[1][i]*100)}%", xy=(ratio[1][i]/2, i),va = 'center', ha='center',fontsize=14, fontweight='light', fontfamily='serif',color='black')
    ax.annotate("Ancillary Spend",  xy=(ratio[1][i]/2, -0.25),va = 'center', ha='center',fontsize=14, fontweight='light', fontfamily='serif',color='black')
    
for i in ratio.index:
    ax.annotate(f"{int(ratio[0][i]*100)}%",xy=(ratio[1][i]+ratio[0][i]/2, i),va = 'center', ha='center',fontsize=14, fontweight='light', fontfamily='serif',color='black')
    ax.annotate("No Ancillary Spend", xy=(ratio[1][i]+ratio[0][i]/2, -0.25),va = 'center', ha='center',fontsize=14, fontweight='light', fontfamily='serif',color='black')

fig.text(0.125,1.00,'Ancillary Spend Distribution', fontfamily='serif',fontsize=15, fontweight='bold')
fig.text(0.125,.90,'Only 20% of all transactions with spend had ancillary charges',fontfamily='serif',fontsize=12)  
plt.show()
In [95]:
flight_rev_ancl=df.groupby(['ancillary_spend_ind'])['FLIGHT_REV'].mean()
print(flight_rev_ancl)
ancillary_spend_ind
0   185.06
1   162.23
Name: FLIGHT_REV, dtype: float64
In [96]:
flight_rev_spend=df.groupby(['Spend_Ind'])['FLIGHT_REV'].mean()
print(flight_rev_spend)
Spend_Ind
0     0.00
1   198.54
Name: FLIGHT_REV, dtype: float64
In [97]:
total_rev_spend=df.groupby(['Spend_Ind'])['Total_Revenue'].mean()
print(total_rev_spend)
Spend_Ind
0     0.00
1   206.59
Name: Total_Revenue, dtype: float64
In [98]:
ancl_rev_spend=df.groupby(['Spend_Ind'])['ancll_rev_ttl'].mean()
print(ancl_rev_spend)
Spend_Ind
0   0.00
1   8.05
Name: ancll_rev_ttl, dtype: float64
In [99]:
total_rev=df.groupby(['ancillary_spend_ind','Spend_Ind'])['Total_Revenue'].mean()
print(total_rev)
ancillary_spend_ind  Spend_Ind
0                    0             0.00
                     1           207.16
1                    1           204.20
Name: Total_Revenue, dtype: float64
In [100]:
flight_rev_ancl=df.groupby(['ancillary_spend_ind'])['FLIGHT_REV'].mean()
print(flight_rev_ancl)
ancillary_spend_ind
0   185.06
1   162.23
Name: FLIGHT_REV, dtype: float64
In [101]:
ancl_rev_ancl=df.groupby(['ancillary_spend_ind'])['ancll_rev_ttl'].mean()
print(ancl_rev_ancl)
ancillary_spend_ind
0    0.00
1   41.97
Name: ancll_rev_ttl, dtype: float64
In [102]:
grouped_rev_ind = df.groupby(['Spend_Ind', 'ancillary_spend_ind']).agg({'Total_Revenue': ['mean','count'],'ancll_rev_ttl': ['mean']})
grouped_rev_ind.columns = ['revenue mean','revenue count','ancl mean']
grouped_rev_ind = grouped_rev_ind.reset_index()
print(grouped_rev_ind)
   Spend_Ind  ancillary_spend_ind  revenue mean  revenue count  ancl mean
0          0                    0          0.00             88       0.00
1          1                    0        207.16            737       0.00
2          1                    1        204.20            175      41.97
In [103]:
grouped_value = df.groupby(['Spend_Ind', 'ancillary_spend_ind','aa_value_code']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
grouped_value.columns = ['revenue mean','revenue count','flight mean','ancillary mean']
grouped_value = grouped_value.reset_index()
grouped_value
Out[103]:
Spend_Ind ancillary_spend_ind aa_value_code revenue mean revenue count flight mean ancillary mean
0 0 0 High Value 0.00 10 0.00 0.00
1 0 0 Medium Value 0.00 47 0.00 0.00
2 0 0 Value 0.00 31 0.00 0.00
3 1 0 High Value 282.83 192 282.83 0.00
4 1 0 Medium Value 178.22 297 178.22 0.00
5 1 0 Value 183.23 248 183.23 0.00
6 1 1 High Value 207.03 12 174.10 32.93
7 1 1 Medium Value 210.43 81 169.95 40.47
8 1 1 Value 197.64 82 152.86 44.78
In [104]:
#Total revenue, flight Revenue and ancillary revenue by customer value
# df_subset_spenda = df[df['ancillary_spend_ind']==1]
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.figure(figsize=(18,4))

plt.subplot(1,3,1)
ax = sns.barplot(x="Total_Revenue", y='aa_value_code', hue="ancillary_spend_ind", data=df_subset_spend, ci=None, order=df_subset_spend.aa_value_code.value_counts().iloc[:4].index).set(title='Total Revenue by Customer Value', xlabel='Total Revenue', ylabel='Value Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc=5, borderaxespad=0.)

# ##plt.xlim(0, 200)
plt.subplot(1,3,2)
ax = sns.barplot(x="FLIGHT_REV", y='aa_value_code', hue="ancillary_spend_ind", data=df_subset_spend, ci=None, order=df_subset_spend.aa_value_code.value_counts().iloc[:4].index).set(title='Flight Revenue by Customer Value', xlabel='Flight Revenue', ylabel='Value Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc=5, borderaxespad=0.)

plt.subplot(1,3,3)
ax = sns.barplot(x="ancll_rev_ttl", y='aa_value_code', hue="ancillary_spend_ind", data=df_subset_spend, ci=None, order=df_subset_spend.aa_value_code.value_counts().iloc[:4].index).set(title='Ancillary Revenue by Customer Value', xlabel='Ancillary Revenue', ylabel='Value Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc=5, borderaxespad=0.)
Out[104]:
<matplotlib.legend.Legend at 0x7f17ecf79f28>
In [105]:
grouped_loyalty = df.groupby(['Spend_Ind', 'ancillary_spend_ind','LYLTY_LEVEL_CD']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
grouped_loyalty.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
grouped_loyalty = grouped_loyalty.reset_index()
grouped_loyalty
Out[105]:
Spend_Ind ancillary_spend_ind LYLTY_LEVEL_CD ttl mean ttl count flight mean ancllry mean
0 0 0 E 0.00 4 0.00 0.00
1 0 0 G 0.00 15 0.00 0.00
2 0 0 P 0.00 7 0.00 0.00
3 0 0 R 0.00 59 0.00 0.00
4 0 0 T 0.00 3 0.00 0.00
5 1 0 C 356.00 5 356.00 0.00
6 1 0 E 289.55 77 289.55 0.00
7 1 0 G 226.06 131 226.06 0.00
8 1 0 P 251.07 93 251.07 0.00
9 1 0 R 173.00 401 173.00 0.00
10 1 0 T 208.75 30 208.75 0.00
11 1 1 G 229.75 16 180.52 49.23
12 1 1 R 201.63 159 160.39 41.24
In [106]:
#Total revenue, flight Revenue and ancillary revenue by loyalty level
sns.catplot(y='Total_Revenue',x='LYLTY_LEVEL_CD',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend,order=df_subset_spend.LYLTY_LEVEL_CD.value_counts().iloc[:7].index).set(title='Total Revenue by Loyalty Level', 
                                                                            xlabel='Loyalty Level', ylabel='Total Revenue')
plt.ylim(0, 1000)
sns.catplot(y='FLIGHT_REV',x='LYLTY_LEVEL_CD',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.LYLTY_LEVEL_CD.value_counts().iloc[:7].index).set(title='Flight Revenue by Loyalty Level', 
                                                                            xlabel='Loyalty Level', ylabel='Flight Revenue')
plt.ylim(0, 1000)
sns.catplot(y='ancll_rev_ttl',x='LYLTY_LEVEL_CD',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.LYLTY_LEVEL_CD.value_counts().iloc[:7].index).set(title='Ancillary Revenue by Loyalty Level', 
                                                                            xlabel='Loyalty Level', ylabel='Ancillary Revenue')
plt.ylim(0, 200)
Out[106]:
(0.0, 200.0)
In [107]:
grouped_age = df.groupby(['Spend_Ind', 'ancillary_spend_ind','age_cat','GENDER_CD']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
grouped_age.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
grouped_age = grouped_age.reset_index()
grouped_age
Out[107]:
Spend_Ind ancillary_spend_ind age_cat GENDER_CD ttl mean ttl count flight mean ancllry mean
0 0 0 Boomers F 0.00 18.00 0.00 0.00
1 0 0 Boomers M 0.00 20.00 0.00 0.00
2 0 0 Boomers U nan nan nan nan
3 0 0 Boomers X nan nan nan nan
4 0 0 Millennial F 0.00 8.00 0.00 0.00
5 0 0 Millennial M 0.00 9.00 0.00 0.00
6 0 0 Millennial U nan nan nan nan
7 0 0 Millennial X nan nan nan nan
8 0 0 Silent F 0.00 4.00 0.00 0.00
9 0 0 Silent M 0.00 4.00 0.00 0.00
10 0 0 Silent U nan nan nan nan
11 0 0 Silent X nan nan nan nan
12 0 0 X F 0.00 6.00 0.00 0.00
13 0 0 X M 0.00 14.00 0.00 0.00
14 0 0 X U nan nan nan nan
15 0 0 X X nan nan nan nan
16 0 0 Z F 0.00 2.00 0.00 0.00
17 0 0 Z M 0.00 3.00 0.00 0.00
18 0 0 Z U nan nan nan nan
19 0 0 Z X nan nan nan nan
20 0 1 Boomers F nan nan nan nan
21 0 1 Boomers M nan nan nan nan
22 0 1 Boomers U nan nan nan nan
23 0 1 Boomers X nan nan nan nan
24 0 1 Millennial F nan nan nan nan
25 0 1 Millennial M nan nan nan nan
26 0 1 Millennial U nan nan nan nan
27 0 1 Millennial X nan nan nan nan
28 0 1 Silent F nan nan nan nan
29 0 1 Silent M nan nan nan nan
30 0 1 Silent U nan nan nan nan
31 0 1 Silent X nan nan nan nan
32 0 1 X F nan nan nan nan
33 0 1 X M nan nan nan nan
34 0 1 X U nan nan nan nan
35 0 1 X X nan nan nan nan
36 0 1 Z F nan nan nan nan
37 0 1 Z M nan nan nan nan
38 0 1 Z U nan nan nan nan
39 0 1 Z X nan nan nan nan
40 1 0 Boomers F 180.68 85.00 180.68 0.00
41 1 0 Boomers M 208.77 163.00 208.77 0.00
42 1 0 Boomers U nan nan nan nan
43 1 0 Boomers X nan nan nan nan
44 1 0 Millennial F 156.87 78.00 156.87 0.00
45 1 0 Millennial M 201.15 105.00 201.15 0.00
46 1 0 Millennial U 192.40 4.00 192.40 0.00
47 1 0 Millennial X nan nan nan nan
48 1 0 Silent F 184.91 14.00 184.91 0.00
49 1 0 Silent M 311.93 10.00 311.93 0.00
50 1 0 Silent U nan nan nan nan
51 1 0 Silent X nan nan nan nan
52 1 0 X F 185.25 79.00 185.25 0.00
53 1 0 X M 255.64 171.00 255.64 0.00
54 1 0 X U 245.40 3.00 245.40 0.00
55 1 0 X X 108.78 1.00 108.78 0.00
56 1 0 Z F 198.21 13.00 198.21 0.00
57 1 0 Z M 152.77 11.00 152.77 0.00
58 1 0 Z U nan nan nan nan
59 1 0 Z X nan nan nan nan
60 1 1 Boomers F 195.86 27.00 148.73 47.13
61 1 1 Boomers M 228.19 28.00 182.05 46.14
62 1 1 Boomers U 101.63 1.00 71.63 30.00
63 1 1 Boomers X nan nan nan nan
64 1 1 Millennial F 190.48 21.00 151.97 38.51
65 1 1 Millennial M 177.68 24.00 145.35 32.34
66 1 1 Millennial U nan nan nan nan
67 1 1 Millennial X nan nan nan nan
68 1 1 Silent F 216.41 6.00 159.82 56.59
69 1 1 Silent M 179.62 4.00 119.78 59.84
70 1 1 Silent U nan nan nan nan
71 1 1 Silent X nan nan nan nan
72 1 1 X F 189.30 17.00 156.13 33.16
73 1 1 X M 239.35 40.00 194.60 44.74
74 1 1 X U nan nan nan nan
75 1 1 X X nan nan nan nan
76 1 1 Z F 109.30 5.00 79.31 29.99
77 1 1 Z M 168.06 2.00 126.18 41.88
78 1 1 Z U nan nan nan nan
79 1 1 Z X nan nan nan nan
In [108]:
#Total revenue, flight Revenue and ancillary revenue by Barcley Card 
sns.catplot(y='Total_Revenue',x='BARCLAYS_CARD_HOLDER_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend,order=df_subset_spend.BARCLAYS_CARD_HOLDER_IND.value_counts().iloc[:7].index).set(title='Total Revenue by Card Holder Status', 
                                                                            xlabel='Barcley Card Indicator', ylabel='Total Revenue')
plt.ylim(0, 250)
sns.catplot(y='FLIGHT_REV',x='BARCLAYS_CARD_HOLDER_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.BARCLAYS_CARD_HOLDER_IND.value_counts().iloc[:7].index).set(title='Flight Revenue by Card Holder Status', 
                                                                            xlabel='Barcley Card Indicator', ylabel='Flight Revenue')
plt.ylim(0, 250)
sns.catplot(y='ancll_rev_ttl',x='BARCLAYS_CARD_HOLDER_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.BARCLAYS_CARD_HOLDER_IND.value_counts().iloc[:7].index).set(title='Ancillary Revenue by Card Holder Status', 
                                                                            xlabel='Barcley Card Indicator', ylabel='Ancillary Revenue')
plt.ylim(0, 75)
Out[108]:
(0.0, 75.0)
In [109]:
barcley_value_multiple = df_subset_spend.groupby(['Spend_Ind', 'ancillary_spend_ind','BARCLAYS_CARD_HOLDER_IND']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
barcley_value_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
barcley_value_multiple = barcley_value_multiple.reset_index()
barcley_value_multiple
Out[109]:
Spend_Ind ancillary_spend_ind BARCLAYS_CARD_HOLDER_IND ttl mean ttl count flight mean ancllry mean
0 1 0 N 207.84 647 207.84 0.00
1 1 0 Y 202.26 90 202.26 0.00
2 1 1 N 204.22 163 162.22 41.99
3 1 1 Y 203.98 12 162.28 41.70
In [110]:
#Total revenue, flight Revenue and ancillary revenue by Citi Card holder
sns.catplot(y='Total_Revenue',x='CITIBNK_PREMIM_CARD_HLDR_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend,order=df_subset_spend.CITIBNK_PREMIM_CARD_HLDR_IND.value_counts().iloc[:7].index).set(title='Total Revenue by Card Holder Status', 
                                                                            xlabel='Citi Card Indicator', ylabel='Total Revenue')
plt.ylim(0, 250)
sns.catplot(y='FLIGHT_REV',x='CITIBNK_PREMIM_CARD_HLDR_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.CITIBNK_PREMIM_CARD_HLDR_IND.value_counts().iloc[:7].index).set(title='Flight Revenue by Card Holder Status', 
                                                                            xlabel='Citi Card Indicator', ylabel='Flight Revenue')
plt.ylim(0, 250)
sns.catplot(y='ancll_rev_ttl',x='CITIBNK_PREMIM_CARD_HLDR_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.CITIBNK_PREMIM_CARD_HLDR_IND.value_counts().iloc[:7].index).set(title='Ancillary Revenue by Card Holder Status', 
                                                                            xlabel='Citi Card Indicator', ylabel='Ancillary Revenue')
plt.ylim(0, 75)
Out[110]:
(0.0, 75.0)
In [111]:
citi_value_multiple = df_subset_spend.groupby(['Spend_Ind', 'ancillary_spend_ind','CITIBNK_PREMIM_CARD_HLDR_IND']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
citi_value_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
citi_value_multiple = citi_value_multiple.reset_index()
citi_value_multiple
Out[111]:
Spend_Ind ancillary_spend_ind CITIBNK_PREMIM_CARD_HLDR_IND ttl mean ttl count flight mean ancllry mean
0 1 0 N 200.24 568 200.24 0.00
1 1 0 Y 230.41 169 230.41 0.00
2 1 1 N 202.28 155 160.44 41.84
3 1 1 Y 219.10 20 176.05 43.05
In [112]:
#Total revenue, flight Revenue and ancillary revenue by trip type
# df_subset_spenda = df[df['ancillary_spend_ind']==1]
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.figure(figsize=(18,5))

plt.subplot(1,3,1)
ax = sns.barplot(x="Total_Revenue", y='BKG_BUSINES_LEISR_IND', hue="ancillary_spend_ind", data=df_subset_spend, ci=None, order=df_subset_spend.BKG_BUSINES_LEISR_IND.value_counts().iloc[:3].index).set(title='Total Revenue by Trip Type', xlabel='Total Revenue', ylabel='Trip Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc=5, borderaxespad=0.)

# ##plt.xlim(0, 200)
plt.subplot(1,3,2)
ax = sns.barplot(x="FLIGHT_REV", y='BKG_BUSINES_LEISR_IND', hue="ancillary_spend_ind", data=df_subset_spend, ci=None, order=df_subset_spend.BKG_BUSINES_LEISR_IND.value_counts().iloc[:3].index).set(title='Flight Revenue by Trip Type', xlabel='Flight Revenue', ylabel='Trip Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc=5, borderaxespad=0.)

plt.subplot(1,3,3)
ax = sns.barplot(x="ancll_rev_ttl", y='BKG_BUSINES_LEISR_IND', hue="ancillary_spend_ind", data=df_subset_spend, ci=None, order=df_subset_spend.BKG_BUSINES_LEISR_IND.value_counts().iloc[:3].index).set(title='Ancillary Revenue by Trip Type', xlabel='Ancillary Revenue', ylabel='Trip Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc=5, borderaxespad=0.)
Out[112]:
<matplotlib.legend.Legend at 0x7f17e17fabe0>
In [113]:
# x=df.groupby(['Spend_Ind'],['ancillary_spend_ind'])['Total_Revenue'].mean()
# print(x)

BL_multiple = df.groupby(['Spend_Ind', 'ancillary_spend_ind','BKG_BUSINES_LEISR_IND']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
BL_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
BL_multiple = BL_multiple.reset_index()
BL_multiple
Out[113]:
Spend_Ind ancillary_spend_ind BKG_BUSINES_LEISR_IND ttl mean ttl count flight mean ancllry mean
0 0 0 B 0.00 3 0.00 0.00
1 0 0 L 0.00 85 0.00 0.00
2 1 0 B 217.61 438 217.61 0.00
3 1 0 L 191.84 299 191.84 0.00
4 1 1 B 228.97 70 189.67 39.31
5 1 1 L 187.69 105 143.93 43.75
In [114]:
#Total revenue, flight Revenue and ancillary revenue by booking type
plt.figure(figsize=(16,6))
sns.set_theme(style="dark")
sns.set_palette('coolwarm')

plt.subplot(1,3,1)
ax = sns.boxplot(x="Total_Revenue", y="BX_IND",hue='ancillary_spend_ind', data=df_subset_spend, order=df_subset_spend.BX_IND.value_counts().iloc[:7].index, whis= 4).set(title='Total Revenue by Booking Type', 
      xlabel='Total Revenue', ylabel='Business Account Indicator')
plt.xlim(0, 300)
plt.legend(bbox_to_anchor=(1.05, 1.1), loc=3, borderaxespad=0.)

plt.subplot(1,3,2)
ax = sns.boxplot(x="FLIGHT_REV", y="BX_IND",hue='ancillary_spend_ind', data=df_subset_spend, order=df_subset_spend.BX_IND.value_counts().iloc[:7].index, whis= 4).set(title='Flight Revenue by Booking Type', 
      xlabel='Flight Revenue', ylabel='Business Account Indicator')
plt.xlim(0, 300)
plt.legend(bbox_to_anchor=(1.05, 1.1), loc=3, borderaxespad=0.)

plt.subplot(1,3,3)
ax = sns.boxplot(x="ancll_rev_ttl", y="BX_IND",hue='ancillary_spend_ind', data=df_subset_spend, order=df_subset_spend.BX_IND.value_counts().iloc[:7].index, whis= 4).set(title='Ancillary Revenue by Booking Type', 
      xlabel='Ancillary Revenue', ylabel='Business Account Indicator')
plt.xlim(0, 75)
plt.legend(bbox_to_anchor=(1.05, 1.1), loc=3, borderaxespad=0.)
Out[114]:
<matplotlib.legend.Legend at 0x7f17ec946a20>
In [115]:
bx_value_multiple = df_subset_spend.groupby(['Spend_Ind', 'ancillary_spend_ind','BX_IND']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
bx_value_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
bx_value_multiple = bx_value_multiple.reset_index()
bx_value_multiple
Out[115]:
Spend_Ind ancillary_spend_ind BX_IND ttl mean ttl count flight mean ancllry mean
0 1 0 N 204.02 657 204.02 0.00
1 1 0 Y 232.95 80 232.95 0.00
2 1 1 N 203.54 165 161.50 42.04
3 1 1 Y 215.18 10 174.29 40.89
In [116]:
#Total revenue, flight Revenue and ancillary revenue by ticket type
sns.catplot(y='Total_Revenue',x='TKT_TYPE',hue='ancillary_spend_ind',kind='bar', data=df,order=df.TKT_TYPE.value_counts().iloc[:7].index).set(title='Total Revenue by Ticket Type', 

                                                                                                                                              xlabel='Ticket Type', ylabel='Total Revenue')
plt.ylim(0, 250)
sns.catplot(y='FLIGHT_REV',x='TKT_TYPE',hue='ancillary_spend_ind',kind='bar', data=df, order=df.TKT_TYPE.value_counts().iloc[:7].index).set(title='Flight Revenue by Ticket Type', 
                                                                            xlabel='Ticket Type', ylabel='Flight Revenue')
plt.ylim(0, 250)
sns.catplot(y='ancll_rev_ttl',x='TKT_TYPE',hue='ancillary_spend_ind',kind='bar', data=df, order=df.TKT_TYPE.value_counts().iloc[:7].index).set(title='Ancillary Revenue by Ticket Type', 
                                                                            xlabel='Ticket Type', ylabel='Ancillary Revenue')
plt.ylim(0, 75)
Out[116]:
(0.0, 75.0)
In [117]:
ticket_value_multiple = df.groupby(['Spend_Ind', 'ancillary_spend_ind','TKT_TYPE']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
ticket_value_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
ticket_value_multiple = ticket_value_multiple.reset_index()
ticket_value_multiple
Out[117]:
Spend_Ind ancillary_spend_ind TKT_TYPE ttl mean ttl count flight mean ancllry mean
0 0 0 A 0.00 88 0.00 0.00
1 1 0 R 207.16 737 207.16 0.00
2 1 1 A 49.81 9 0.00 49.81
3 1 1 R 212.57 166 171.02 41.55
In [118]:
#Total revenue, flight Revenue and ancillary revenue by cabin flown
sns.catplot(y='Total_Revenue',x='CABIN_FLOWN',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend,order=df_subset_spend.CABIN_FLOWN.value_counts().iloc[:4].index, ci=None).set(title='Total Revenue by Cabin Flown', 
                                                                            xlabel='Cabin Flown', ylabel='Total Revenue')
plt.ylim(0, 3000)
sns.catplot(y='FLIGHT_REV',x='CABIN_FLOWN',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.CABIN_FLOWN.value_counts().iloc[:4].index, ci=None).set(title='Flight Revenue by Cabin Flown', 
                                                                            xlabel='Cabin Flow', ylabel='Flight Revenue')
plt.ylim(0, 3000)
sns.catplot(y='ancll_rev_ttl',x='CABIN_FLOWN',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.CABIN_FLOWN.value_counts().iloc[:4].index, ci=None).set(title='Ancillary Revenue by Cabin Flown', 
                                                                            xlabel='Cabin Flown', ylabel='Ancillary Revenue')
plt.ylim(0, 500)


# t1.CABIN_BOOKED, /*Y - main cabin, W - premium economy, C = business, F = first*/
# t1.CABIN_FLOWN, /*Y - main cabin, W - premium economy, C = business, F = first*/
Out[118]:
(0.0, 500.0)
In [119]:
cabin_value_multiple = df_subset_spend.groupby(['Spend_Ind', 'ancillary_spend_ind','CABIN_FLOWN']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
cabin_value_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
cabin_value_multiple = cabin_value_multiple.reset_index()
cabin_value_multiple
Out[119]:
Spend_Ind ancillary_spend_ind CABIN_FLOWN ttl mean ttl count flight mean ancllry mean
0 1 0 C 367.18 146 367.18 0.00
1 1 0 W 701.57 6 701.57 0.00
2 1 0 Y 162.15 585 162.15 0.00
3 1 1 C 413.21 5 353.66 59.55
4 1 1 Y 198.05 170 156.60 41.46
In [120]:
#Total revenue, flight Revenue and ancillary revenue by upgrage indicator
sns.catplot(y='Total_Revenue',x='upgrade_ind',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend,order=df_subset_spend.upgrade_ind.value_counts().iloc[:4].index, ci=None).set(title='Total Revenue by Upgrade ', 
                                                                            xlabel='Upgrade Indicator', ylabel='Total Revenue')
plt.ylim(0, 700)
sns.catplot(y='FLIGHT_REV',x='upgrade_ind',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.upgrade_ind.value_counts().iloc[:4].index, ci=None).set(title='Flight Revenue by Upgrade', 
                                                                            xlabel='Upgrade Indicator', ylabel='Flight Revenue')
plt.ylim(0, 700)
sns.catplot(y='ancll_rev_ttl',x='upgrade_ind',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.upgrade_ind.value_counts().iloc[:4].index, ci=None).set(title='Ancillary Revenue by Upgrade', 
                                                                            xlabel='Upgrade Indicator', ylabel='Ancillary Revenue')
plt.ylim(0, 200)
Out[120]:
(0.0, 200.0)
In [121]:
upgrade_value_multiple = df_subset_spend.groupby(['Spend_Ind', 'ancillary_spend_ind','upgrade_ind']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
upgrade_value_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
upgrade_value_multiple = upgrade_value_multiple.reset_index()
upgrade_value_multiple
Out[121]:
Spend_Ind ancillary_spend_ind upgrade_ind ttl mean ttl count flight mean ancllry mean
0 1 0 0 194.41 609 194.41 0.00
1 1 0 1 267.81 128 267.81 0.00
2 1 1 0 198.79 171 157.35 41.45
3 1 1 1 435.36 4 370.91 64.44
In [122]:
# Total Revenue on CITIBNK PREMIM CARD HLDR by BX_IND
sns.set_theme(style="dark")
sns.set_palette('coolwarm')

sns.violinplot(data=df_subset_spend, x="BX_IND", y="Total_Revenue", hue="CITIBNK_PREMIM_CARD_HLDR_IND",
               split=True, inner="quart", linewidth=1,
               palette="coolwarm")
sns.despine(left=True)
plt.ylim(0, 1000)
Out[122]:
(0.0, 1000.0)
In [123]:
# Total Revenue on CITIBNK PREMIM CARD HLDR by bussiness leisure indicator
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
g = sns.catplot(
    data=df_subset_spend, kind="bar",
    x="BKG_BUSINES_LEISR_IND", y="Total_Revenue", hue="CITIBNK_PREMIM_CARD_HLDR_IND",
    ci="sd", palette="dark", alpha=.6, height=6
)
g.despine(left=True)
g.set_axis_labels("Business Leisure Indicator", "Total Revenue")
g.legend.set_title("Citibank Card Holder")
In [124]:
# Flight Revenue on CITIBNK PREMIM CARD HLDR by bussiness leisure indicator
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
g = sns.catplot(
    data=df_subset_spend, kind="bar",
    x="BKG_BUSINES_LEISR_IND", y="FLIGHT_REV", hue="CITIBNK_PREMIM_CARD_HLDR_IND",
    ci="sd", palette="dark", alpha=.6, height=6
)
g.despine(left=True)
g.set_axis_labels("Business Leisure Indicator", "Flight Revenue")
g.legend.set_title("Citibank Card Holder")
In [125]:
# Total Ancillary on CITIBNK PREMIM CARD HLDR by bussiness leisure indicator
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
g = sns.catplot(
    data=df_subset_spend, kind="bar",
    x="BKG_BUSINES_LEISR_IND", y="ancll_rev_ttl", hue="CITIBNK_PREMIM_CARD_HLDR_IND",
    ci="sd", palette="dark", alpha=.6, height=6
)
g.despine(left=True)
g.set_axis_labels("Business Leisure Indicator", "Ancillary Revenue")
g.legend.set_title("Citibank Card Holder")
In [126]:
# Total revenue on BARCLAYS CARD HOLDER
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
sns.boxplot(x="LYLTY_LEVEL_CD", y="Total_Revenue",
            hue="BARCLAYS_CARD_HOLDER_IND",
            data=df_subset_spend)
#sns.despine(offset=10, trim=True)
plt.ylim(0, 1000)
Out[126]:
(0.0, 1000.0)
In [127]:
# Ancillary revenue on BARCLAYS CARD HOLDER
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
sns.boxplot(x="LYLTY_LEVEL_CD", y="ancll_rev_ttl",
            hue="BARCLAYS_CARD_HOLDER_IND",
            data=df_subset_ancll_spend)
#sns.despine(offset=10, trim=True)
plt.ylim(0, 800)
Out[127]:
(0.0, 800.0)
In [128]:
# Total Revenue by Loyalty Level, age and gender
sns.set_theme(style="dark")
sns.set_palette('coolwarm')

plt.figure(figsize=(16,6))
ax = sns.boxplot(x="Total_Revenue", y="age_cat",hue='GENDER_CD', data=df_subset_spend, order=df_subset_spend.age_cat.value_counts().iloc[:7].index).set(title='Total Revenue by Loyalty Level', 
      xlabel='Total Revenue', ylabel='Age Category')
# # plot distplot
# fig, ax = plt.subplots()
# sns.distplot(df_subset_spend, ax = ax)
plt.xlim(0, 500)
# # change the limits of X-axis

# plt.show()
# loc, labels = plt.xticks()
# a = graph.set_xticklabels(labels, rotation=180)
Out[128]:
(0.0, 500.0)
In [129]:
#Flight Revenue by Loyalty Level, age and gender
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.figure(figsize=(16,6))
ax = sns.boxplot(x="FLIGHT_REV", y="age_cat",hue='GENDER_CD', data=df_subset_spend, order=df_subset_spend.age_cat.value_counts().iloc[:7].index).set(title='Flight Revenue by Loyalty Level', 
      xlabel='Flight Revenue', ylabel='Age Category')
# # plot distplot
# fig, ax = plt.subplots()
# sns.distplot(df_subset_spend, ax = ax)
plt.xlim(0, 500)
# # change the limits of X-axis

# plt.show()
# loc, labels = plt.xticks()
# a = graph.set_xticklabels(labels, rotation=180)
Out[129]:
(0.0, 500.0)
In [130]:
# Total revenue by loyalty level 
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.figure(figsize=(16,6))
graph = sns.boxplot(x="Total_Revenue", y="LYLTY_LEVEL_CD", data=df_subset_spend, order=df_subset_spend.LYLTY_LEVEL_CD.value_counts().iloc[:7].index).set(title='Total Revenue by Loyalty Level', 
                                                                            xlabel='Total Revenue', ylabel='Loyalty Level')
# loc, labels = plt.xticks()
plt.xlim(0, 2000)
# a = graph.set_xticklabels(labels, rotation=180)
Out[130]:
(0.0, 2000.0)
In [131]:
# Flight revenue by loyalty level 
sns.set_theme(style="dark")
sns.set_palette('coolwarm')

plt.figure(figsize=(16,6))
graph = sns.boxplot(x="FLIGHT_REV", y="LYLTY_LEVEL_CD", data=df_subset_spend)
loc, labels = plt.xticks()
a = graph.set_xticklabels(labels, rotation=180)
plt.xlim(0, 2000)
Out[131]:
(0.0, 2000.0)
In [132]:
#Total revenue, flight Revenue and ancillary revenue by age and gender

sns.set_theme(style="dark")
sns.set_palette('coolwarm')

plt.figure(figsize=(18,5))

plt.subplot(1,3,1)
sns.barplot(df_subset_spend["Total_Revenue"], df_subset_spend["age_cat"])
plt.title("Total Revenue by Gender")
plt.xlim(0, 500)

plt.subplot(1,3,2)
sns.barplot(df_subset_spend["FLIGHT_REV"],  df_subset_spend["age_cat"])
plt.title("Flight Revenue by Gender")
plt.xlim(0, 500)

plt.subplot(1,3,3)
sns.barplot(df_subset_spend["ancll_rev_ttl"],  df_subset_spend["age_cat"])
plt.title("Ancillary Revenue by Gender")
plt.xlim(0, 100)

plt.show()
In [133]:
# Flight Revenue by Age and Gender
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
ax = sns.barplot(x="FLIGHT_REV", y="age_cat", hue="GENDER_CD", data=df_subset_spend, ci=None, order=df_subset_spend.age_cat.value_counts().iloc[:7].index).set(title='Flight Revenue by Age and Gender', xlabel='', ylabel='')
plt.xlim(0, 500)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
Out[133]:
<matplotlib.legend.Legend at 0x7f17e17039e8>
In [134]:
#Ancillary Revenue by Age and Gender
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
ax = sns.barplot(x="ancll_rev_ttl", y="age_cat", hue="GENDER_CD", data=df_subset_ancll_spend, ci=None, order=df_subset_ancll_spend.age_cat.value_counts().iloc[:7].index).set(title='Ancillary Revenue by Age and Gender', xlabel='', ylabel='')
plt.xlim(0, 200)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
Out[134]:
<matplotlib.legend.Legend at 0x7f17e1774710>
In [135]:
# Ancillary Revenue by Age and Gender
plt.figure(figsize=(18,5))
sns.set_theme(style="dark")
sns.set_palette('coolwarm')

ax = sns.barplot(x='ancll_rev_ttl', y="age_cat", hue="GENDER_CD", data=df_subset_ancll_spend, ci=None, order=df_subset_ancll_spend.age_cat.value_counts().iloc[:7].index).set(title='Ancillary Revenue by Age and Gender', xlabel='', ylabel='')
plt.xlim(0, 200)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
Out[135]:
<matplotlib.legend.Legend at 0x7f17e1480dd8>
In [136]:
#Total Revenue by Age and Gender
plt.figure(figsize=(18,5))
sns.set_theme(style="dark")
sns.set_palette('coolwarm')

ax = sns.barplot(x='Total_Revenue', y="age_cat", hue="GENDER_CD", data=df_subset_ancll_spend, ci=None, order=df_subset_ancll_spend.age_cat.value_counts().iloc[:7].index).set(title='Total Revenue by Age and Gender', xlabel='', ylabel='')
plt.xlim(0, 1000)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
Out[136]:
<matplotlib.legend.Legend at 0x7f17e0105198>
In [137]:
# Fare class analysis on total revenue, flight revenue and total ancillary 
sns.catplot(y='Total_Revenue',x='ACCT_FARE_CLASS_TYPE',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend,order=df_subset_spend.ACCT_FARE_CLASS_TYPE.value_counts().iloc[:7].index, height=4, aspect=2.5, ci=None).set(title='Total Revenue by Fare Class', 
                                                                            xlabel='Fare Class', ylabel='Total Revenue')
plt.ylim(0, 3000)
sns.catplot(y='FLIGHT_REV',x='ACCT_FARE_CLASS_TYPE',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.ACCT_FARE_CLASS_TYPE.value_counts().iloc[:7].index, height=4, aspect=2.5, ci=None).set(title='Flight Revenue by Fare Class', 
                                                                            xlabel='Fare Class', ylabel='Flight Revenue')
plt.ylim(0, 3000)
sns.catplot(y='ancll_rev_ttl',x='ACCT_FARE_CLASS_TYPE',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.ACCT_FARE_CLASS_TYPE.value_counts().iloc[:7].index, height=4, aspect=2.5, ci=None).set(title='Ancillary Revenue by Fare Class', 
                                                                            xlabel='Fare Class', ylabel='Ancillary Revenue')
plt.ylim(0, 300)
Out[137]:
(0.0, 300.0)
In [138]:
# Travel type analysis on total revenue, flight revenue and total ancillary 
sns.set_theme(style="dark")
sns.set_palette('coolwarm')

sns.catplot(y='Total_Revenue',x='SEG_INTRVL_TYPE',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend,order=df_subset_spend.SEG_INTRVL_TYPE.value_counts().iloc[:4].index, ci=None).set(title='Total Revenue by Segment ', 
                                                                            xlabel='Travel Type', ylabel='Total Revenue')
plt.ylim(0, 700)
sns.catplot(y='FLIGHT_REV',x='SEG_INTRVL_TYPE',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.SEG_INTRVL_TYPE.value_counts().iloc[:4].index, ci=None).set(title='Flight Revenue by Segment', 
                                                                            xlabel='Travel Type', ylabel='Flight Revenue')
plt.ylim(0, 700)
sns.catplot(y='ancll_rev_ttl',x='SEG_INTRVL_TYPE',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.SEG_INTRVL_TYPE.value_counts().iloc[:4].index, ci=None).set(title='Ancillary Revenue by Segment', 
                                                                            xlabel='Travel Type', ylabel='Ancillary Revenue')
plt.ylim(0, 200)
Out[138]:
(0.0, 200.0)
In [139]:
# Upgrade indicator analysis on total revenue, flight revenue and total ancillary by lounge member
sns.set_theme(style="dark")
sns.set_palette('coolwarm')

sns.catplot(y='Total_Revenue',x='LOUNGE_MBR_PRTCPNT_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend,order=df_subset_spend.LOUNGE_MBR_PRTCPNT_IND.value_counts().iloc[:4].index, ci=None).set(title='Total Revenue by Lounge Membership', 
                                                                            xlabel='Upgrade Indicator', ylabel='Total Revenue')
plt.ylim(0, 700)
sns.catplot(y='FLIGHT_REV',x='LOUNGE_MBR_PRTCPNT_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.LOUNGE_MBR_PRTCPNT_IND.value_counts().iloc[:4].index, ci=None).set(title='Flight Revenue by Lounge Membership', 
                                                                            xlabel='Upgrade Indicator', ylabel='Flight Revenue')
plt.ylim(0, 700)
sns.catplot(y='ancll_rev_ttl',x='LOUNGE_MBR_PRTCPNT_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.LOUNGE_MBR_PRTCPNT_IND.value_counts().iloc[:4].index, ci=None).set(title='Ancillary Revenue by Lounge Membership', 
                                                                            xlabel='Upgrade Indicator', ylabel='Ancillary Revenue')
plt.ylim(0, 200)
Out[139]:
(0.0, 200.0)
In [140]:
#Day of the week analysis on total revenue, flight revenue and total ancillary
plt.figure(figsize=(18,5))
sns.set_theme(style="dark")
sns.set_palette('coolwarm')

plt.subplot(1,3,1)
sns.barplot(df_subset_spend["Total_Revenue"], df_subset_spend["day_of_week_dep"])
plt.title("Total Revenue by Day of Week")

plt.subplot(1,3,2)
sns.barplot(df_subset_spend["FLIGHT_REV"], df_subset_spend["day_of_week_dep"])
plt.title("Flight Revenue by Day of Week")

plt.subplot(1,3,3)
sns.barplot(df_subset_spend["ancll_rev_ttl"], df_subset_spend["day_of_week_dep"])
plt.title("Ancillary Revenue by Day of Week")

plt.show()
In [141]:
#Monthly analysis on total revenue, flight revenue and total ancillary
plt.figure(figsize=(18,5))
sns.set_theme(style="dark")
sns.set_palette('coolwarm')

plt.subplot(1,3,1)
sns.barplot(df_subset_spend["Total_Revenue"], df_subset_spend["dep_month"])
plt.title("Total Revenue by Month")

plt.subplot(1,3,2)
sns.barplot(df_subset_spend["FLIGHT_REV"], df_subset_spend["dep_month"])
plt.title("Flight Revenue by Month")

plt.subplot(1,3,3)
sns.barplot(df_subset_spend["ancll_rev_ttl"], df_subset_spend["dep_month"])
plt.title("Ancillary Revenue by Month")

plt.show()
In [142]:
# Plot production Revenue by month
df_monthly_mean = df_subset_spend.groupby(by="dep_month").mean()
plt.figure(figsize=(18,5))
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.bar(df_monthly_mean.index, height=df_monthly_mean['Total_Revenue'].values)
plt.ylabel('mean(Total_Revenue)')
plt.xlabel('Month')
Out[142]:
Text(0.5, 0, 'Month')
In [143]:
# Plot production Revenue by day of week
df_weekly_mean = df_subset_spend.groupby(by="day_of_week_dep").mean()
plt.figure(figsize=(15,6))
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.bar(df_weekly_mean.index, height=df_weekly_mean['Total_Revenue'].values)
plt.ylabel('mean(Total_Revenue)')
plt.xlabel('Day of Week Departure')
Out[143]:
Text(0.5, 0, 'Day of Week Departure')
In [144]:
#New dataframe for scatter plot
df_scatter = df_subset_spend[['Total_Revenue',
    'FLIGHT_REV',
    'ancll_rev_ttl','rev_pref_seats','rev_bag_non_cat','rev_upgrade','days_before_dept',                
'MILE_GREAT_CIRCLE_DISTANC_QTY']].copy()            
In [145]:
# Scatter plot 
sns.set_palette(sns.color_palette("coolwarm", 16))
sns.pairplot(df_scatter,diag_kind = 'kde',
             size = 4, corner=False)
plt.show()

Corrolation Heatmap¶

In [146]:
#limiting dataframe for numerical features 
df_c=df[[
'AA_OPER_REVNUE_SEG_AMT',     
'CHECK_BAGS_PER_TRIP_AVG',          
'FLOWN_SEG_QTY',                   
'days_before_dept',                
'MILE_GREAT_CIRCLE_DISTANC_QTY',   
'FLIGHT_REV',                       
'ancll_rev_ttl',                    
'rev_pref_seats',                  
'rev_bag_non_cat',                 
'rev_other_non_cat',                
'rev_mileage_multiplier',          
'rev_tsa',                        
'rev_upgrade',                      
'Total_Revenue',
    "Spend_Ind"
]]
df_sc=df_c[(df_c["Spend_Ind"] == 1)]
df_n=df_sc[[
'AA_OPER_REVNUE_SEG_AMT',     
'CHECK_BAGS_PER_TRIP_AVG',          
'FLOWN_SEG_QTY',                   
'days_before_dept',                
'MILE_GREAT_CIRCLE_DISTANC_QTY',   
'FLIGHT_REV',                       
'ancll_rev_ttl',                    
'rev_pref_seats',                  
'rev_bag_non_cat',                 
'rev_other_non_cat',                
'rev_mileage_multiplier',          
'rev_tsa',                        
'rev_upgrade',                      
'Total_Revenue'
]]
# Plot corrolation heatmap
print("matplotlib version " + matplotlib.__version__)
print("seaborn version " + sns.__version__)
sns.set(style='white')

corr = df_n.corr()

plt.figure(figsize=(12,12))
sns.set(font_scale=1)
ax=sns.heatmap(data=corr,
            center=0,
            cmap=sns.diverging_palette(230, 10, as_cmap=True), 
            square=True, linewidth=0.5)
ax.set_title('Feature Corrolation Heatmap')
matplotlib version 3.3.4
seaborn version 0.11.2
Out[146]:
Text(0.5, 1.0, 'Feature Corrolation Heatmap')
In [147]:
plt.figure(figsize=(10,10))
sns.heatmap(df_scatter.corr(),annot=True)
plt.show()

Revenue Breakdown¶

In [148]:
#Revenue breakdown annually
dep_multiple = df_subset_spend.groupby(['Spend_Ind','dep_month','ancillary_spend_ind']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean'],
                                                                                   'rev_bag_non_cat': ['mean'],
                                                                                   'rev_other_non_cat':['mean'],
                                                                                   
                                                                                   'rev_upgrade': ['mean'],
                                                                                   'rev_tsa': ['mean'],
                                                                                   'rev_mileage_multiplier': ['mean'],
                                                                                   'rev_bag': ['mean'],
                                                                                   'rev_pref_seats': ['mean'],})
dep_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean','rev_bag_non_cat', 'rev_other_non_cat','rev_upgrade','rev_tsa','rev_mileage_multiplier','rev_bag','rev_pref_seats']
dep_multiple = dep_multiple.reset_index()
dep_multiple
Out[148]:
Spend_Ind dep_month ancillary_spend_ind ttl mean ttl count flight mean ancllry mean rev_bag_non_cat rev_other_non_cat rev_upgrade rev_tsa rev_mileage_multiplier rev_bag rev_pref_seats
0 1 1 0 234.63 59 234.63 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
1 1 1 1 177.35 19 135.86 41.49 19.74 3.95 0.00 0.00 0.00 0.00 17.81
2 1 10 0 238.73 60 238.73 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
3 1 10 1 224.13 9 188.49 35.64 17.78 0.00 0.00 0.00 0.00 0.00 17.86
4 1 11 0 180.13 61 180.13 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
5 1 11 1 225.12 18 178.82 46.30 18.53 0.00 2.38 0.00 0.00 0.00 25.39
6 1 12 0 252.99 54 252.99 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
7 1 12 1 193.51 13 155.73 37.77 16.92 0.00 3.29 0.00 0.00 0.00 17.56
8 1 2 0 186.85 54 186.85 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
9 1 2 1 166.50 14 132.24 34.27 28.57 0.00 0.00 0.00 0.00 0.00 5.70
10 1 3 0 229.66 55 229.66 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
11 1 3 1 272.24 12 223.85 48.38 17.92 0.00 0.00 0.00 0.00 0.00 30.47
12 1 4 0 192.99 70 192.99 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
13 1 4 1 255.67 10 200.82 54.85 10.00 0.00 0.00 0.00 0.00 0.00 44.85
14 1 5 0 194.74 65 194.74 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
15 1 5 1 167.17 9 131.57 35.60 13.33 8.33 0.00 0.00 0.00 0.00 13.93
16 1 6 0 225.52 62 225.52 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
17 1 6 1 189.20 24 144.79 44.41 21.88 3.12 0.00 1.46 0.00 0.00 17.95
18 1 7 0 214.20 79 214.20 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
19 1 7 1 201.35 17 167.69 33.65 19.41 4.41 0.00 0.00 3.03 0.00 6.80
20 1 8 0 188.24 51 188.24 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
21 1 8 1 227.10 14 186.19 40.92 17.20 0.00 6.12 0.00 0.00 0.00 17.59
22 1 9 0 156.21 67 156.21 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
23 1 9 1 186.16 16 137.91 48.25 23.74 0.00 0.00 0.00 0.00 0.00 24.50
In [149]:
#Revenu breakdown weekly
dow_multiple = df_subset_spend.groupby(['Spend_Ind','day_of_week_dep','ancillary_spend_ind']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean'],
                                                                                   'rev_bag_non_cat': ['mean'],
                                                                                    'rev_other_non_cat':['mean'],
                                                                                   'rev_upgrade': ['mean'],
                                                                                   'rev_tsa': ['mean'],
                                                                                   'rev_mileage_multiplier': ['mean'],
                                                                                   'rev_bag': ['mean'],
                                                                                   'rev_pref_seats': ['mean'],})
dow_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean','rev_bag_non_cat', 'rev_other_non_cat','rev_upgrade','rev_tsa','rev_mileage_multiplier','rev_bag','rev_pref_seats']
dow_multiple = dow_multiple.reset_index()
dow_multiple
Out[149]:
Spend_Ind day_of_week_dep ancillary_spend_ind ttl mean ttl count flight mean ancllry mean rev_bag_non_cat rev_other_non_cat rev_upgrade rev_tsa rev_mileage_multiplier rev_bag rev_pref_seats
0 1 1 0 247.37 93 247.37 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
1 1 1 1 232.09 29 187.63 44.45 15.86 0.00 4.43 0.00 0.00 0.00 24.16
2 1 2 0 181.06 132 181.06 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
3 1 2 1 254.24 18 207.45 46.79 27.42 0.00 0.00 0.00 2.87 0.00 16.50
4 1 3 0 206.68 101 206.68 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
5 1 3 1 202.42 26 159.71 42.71 19.62 2.88 0.00 0.00 0.00 0.00 20.21
6 1 4 0 162.98 102 162.98 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
7 1 4 1 162.52 18 126.34 36.18 22.22 0.00 2.38 0.00 0.00 0.00 11.58
8 1 5 0 227.58 117 227.58 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
9 1 5 1 182.36 27 138.42 43.94 15.02 8.33 0.00 1.30 0.00 0.00 19.28
10 1 6 0 197.91 110 197.91 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
11 1 6 1 216.33 31 175.68 40.65 18.71 0.00 0.00 0.00 0.00 0.00 21.94
12 1 7 0 242.36 82 242.36 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
13 1 7 1 177.32 26 138.62 38.70 21.16 0.00 0.00 0.00 0.00 0.00 17.54
In [150]:
#Revenu breakdown on Domestic arrival
cd_arriv_multiple = US_df.groupby(['Spend_Ind','city_arr','ancillary_spend_ind']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean'],
                                                                                   'rev_bag_non_cat': ['mean'],
                                                                                    'rev_other_non_cat':['mean'],
                                                                                    'rev_upgrade': ['mean'],
                                                                                   'rev_tsa': ['mean'],
                                                                                   'rev_mileage_multiplier': ['mean'],
                                                                                   'rev_bag': ['mean'],
                                                                                   'rev_pref_seats': ['mean'],})
cd_arriv_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean','rev_bag_non_cat', 'rev_other_non_cat','rev_upgrade','rev_tsa','rev_mileage_multiplier','rev_bag','rev_pref_seats']
cd_arriv_multiple = cd_arriv_multiple.reset_index()
cd_arriv_multiple
Out[150]:
Spend_Ind city_arr ancillary_spend_ind ttl mean ttl count flight mean ancllry mean rev_bag_non_cat rev_other_non_cat rev_upgrade rev_tsa rev_mileage_multiplier rev_bag rev_pref_seats
0 0 Austin 0 0.00 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
1 0 Charlotte 0 0.00 3 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
2 0 Chicago 0 0.00 5 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
3 0 Cleveland 0 0.00 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
4 0 Dallas-Fort Worth 0 0.00 16 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
132 1 White Plains 0 357.32 1 357.32 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
133 1 Wichita 0 140.52 2 140.52 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
134 1 Wilmington 0 67.00 3 67.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
135 1 Windsor Locks 0 353.37 3 353.37 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
136 1 Yuma 0 80.72 1 80.72 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00

137 rows × 14 columns

In [151]:
#Revenu breakdown on Domestic departure
cd_dept_multiple = US_df.groupby(['Spend_Ind','city_dep','ancillary_spend_ind']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean'],
                                                                                   'rev_bag_non_cat': ['mean'],
                                                                                    'rev_other_non_cat':['mean'],
                                                                                   'rev_upgrade': ['mean'],
                                                                                   'rev_tsa': ['mean'],
                                                                                   'rev_mileage_multiplier': ['mean'],
                                                                                   'rev_bag': ['mean'],
                                                                                   'rev_pref_seats': ['mean'],})
cd_dept_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean','rev_bag_non_cat', 'rev_other_non_cat','rev_upgrade','rev_tsa','rev_mileage_multiplier','rev_bag','rev_pref_seats']
cd_dept_multiple = cd_dept_multiple.reset_index()
cd_dept_multiple
Out[151]:
Spend_Ind city_dep ancillary_spend_ind ttl mean ttl count flight mean ancllry mean rev_bag_non_cat rev_other_non_cat rev_upgrade rev_tsa rev_mileage_multiplier rev_bag rev_pref_seats
0 0 Aspen 0 0.00 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
1 0 Atlanta 0 0.00 2 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
2 0 Baltimore 0 0.00 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
3 0 Boston 0 0.00 2 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
4 0 Charlotte 0 0.00 10 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
206 1 White Plains 0 226.98 1 226.98 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
207 1 Wilmington 0 74.89 1 74.89 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
208 1 Windsor Locks 0 131.11 2 131.11 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
209 1 Windsor Locks 1 111.86 1 81.86 30.00 30.00 0.00 0.00 0.00 0.00 0.00 0.00
210 1 Yuma 0 91.06 2 91.06 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00

211 rows × 14 columns

In [152]:
#Revenue breakdown on international arrival
ci_arrv_multiple = overseas_df.groupby(['Spend_Ind','city_arr','ancillary_spend_ind']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean'],
                                                                                   'rev_bag_non_cat': ['mean'],
                                                                                    'rev_other_non_cat':['mean'],
                                                                                   'rev_upgrade': ['mean'],
                                                                                   'rev_tsa': ['mean'],
                                                                                   'rev_mileage_multiplier': ['mean'],
                                                                                   'rev_bag': ['mean'],
                                                                                   'rev_pref_seats': ['mean'],})
ci_arrv_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean','rev_bag_non_cat', 'rev_other_non_cat','rev_upgrade','rev_tsa','rev_mileage_multiplier','rev_bag','rev_pref_seats']
ci_arrv_multiple = ci_arrv_multiple.reset_index()
ci_arrv_multiple
Out[152]:
Spend_Ind city_arr ancillary_spend_ind ttl mean ttl count flight mean ancllry mean rev_bag_non_cat rev_other_non_cat rev_upgrade rev_tsa rev_mileage_multiplier rev_bag rev_pref_seats
0 0 Cancun 0 0.00 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
1 0 Charlotte 0 0.00 2 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
2 0 Chicago 0 0.00 2 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
3 0 Miami 0 0.00 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
4 0 Philadelphia 0 0.00 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
5 1 Basse Terre 0 303.18 2 303.18 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
6 1 Bogota 0 75.00 1 75.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
7 1 Bridgetown 1 196.00 1 166.00 30.00 30.00 0.00 0.00 0.00 0.00 0.00 0.00
8 1 Buenos Aires 0 533.00 2 533.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
9 1 Cancun 0 116.58 2 116.58 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
10 1 Charlotte 0 137.66 4 137.66 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
11 1 Charlotte 1 281.20 3 211.67 69.53 46.19 0.00 0.00 0.00 0.00 0.00 23.35
12 1 Chicago 0 811.93 1 811.93 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
13 1 Chicago 1 558.85 4 463.13 95.72 41.22 0.00 0.00 0.00 0.00 0.00 54.50
14 1 Dallas-Fort Worth 0 371.83 13 371.83 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
15 1 Dallas-Fort Worth 1 434.81 3 382.18 52.63 10.21 0.00 0.00 0.00 0.00 0.00 42.42
16 1 Guadalajara 1 119.00 1 89.00 30.00 30.00 0.00 0.00 0.00 0.00 0.00 0.00
17 1 London 0 1278.07 8 1278.07 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
18 1 London 1 363.84 2 315.00 48.83 0.00 0.00 0.00 0.00 0.00 0.00 48.83
19 1 Los Angeles 0 558.05 2 558.05 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
20 1 Madrid 0 914.00 2 914.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
21 1 Mexico City 0 283.00 1 283.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
22 1 Miami 0 268.83 18 268.83 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
23 1 Miami 1 262.86 5 209.45 53.41 34.04 0.00 0.00 0.00 0.00 0.00 19.37
24 1 Montego Bay 0 72.30 2 72.30 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
25 1 Monterrey 0 326.00 1 326.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
26 1 Monterrey 1 341.94 1 307.00 34.94 0.00 0.00 0.00 0.00 0.00 0.00 34.94
27 1 New York 0 228.71 4 228.71 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
28 1 Paris 0 878.20 1 878.20 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
29 1 Philadelphia 0 655.02 7 655.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
30 1 Phoenix 0 196.71 1 196.71 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
31 1 Puerto Vallarta 0 113.02 1 113.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
32 1 Punta Cana 1 177.79 1 120.00 57.79 0.00 0.00 0.00 0.00 0.00 0.00 57.79
33 1 San Juan 0 193.67 1 193.67 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
34 1 San Pedro Sula 0 61.50 1 61.50 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
35 1 Santiago 0 426.95 1 426.95 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
36 1 Santo Domingo 0 144.63 2 144.63 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
37 1 Tokyo 0 1036.05 1 1036.05 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
38 1 Toronto 0 247.25 2 247.25 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
39 1 Willemstad 0 90.58 1 90.58 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
In [153]:
#Revenue breakdown on most frequent routes
mk_dir_multiple = US_df.groupby(['Spend_Ind','MKT_N_DIRECTN_AIRPRT_PAIR_CD','ancillary_spend_ind']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean'],
                                                                                   'rev_bag_non_cat': ['mean'],
                                                                                    'rev_other_non_cat':['mean'],
                                                                                   'rev_upgrade': ['mean'],
                                                                                   'rev_tsa': ['mean'],
                                                                                   'rev_mileage_multiplier': ['mean'],
                                                                                   'rev_bag': ['mean'],
                                                                                   'rev_pref_seats': ['mean'],})
mk_dir_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean','rev_bag_non_cat', 'rev_other_non_cat','rev_upgrade','rev_tsa','rev_mileage_multiplier','rev_bag','rev_pref_seats']
mk_dir_multiple = mk_dir_multiple.reset_index()
mk_dir_multiple
Out[153]:
Spend_Ind MKT_N_DIRECTN_AIRPRT_PAIR_CD ancillary_spend_ind ttl mean ttl count flight mean ancllry mean rev_bag_non_cat rev_other_non_cat rev_upgrade rev_tsa rev_mileage_multiplier rev_bag rev_pref_seats
0 0 ASEORD 0 0.00 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
1 0 ATLDFW 0 0.00 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
2 0 ATLPHL 0 0.00 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
3 0 AUSDFW 0 0.00 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
4 0 BDLCLT 0 0.00 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
535 1 PHXSNA 0 115.35 1 115.35 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
536 1 PHXSNA 1 144.32 1 97.56 46.76 30.00 0.00 0.00 0.00 0.00 0.00 16.76
537 1 PHXTPA 0 224.38 1 224.38 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
538 1 PHXTUS 0 103.48 4 103.48 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
539 1 PHXYUM 0 62.48 2 62.48 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00

540 rows × 14 columns

In [154]:
#Revenue breakdown on age groups
age_cat_multiple = df_subset_spend.groupby(['Spend_Ind','age_cat','ancillary_spend_ind']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean'],
                                                                                   'rev_bag_non_cat': ['mean'],
                                                                                    'rev_other_non_cat':['mean'],
                                                                                   'rev_upgrade': ['mean'],
                                                                                   'rev_tsa': ['mean'],
                                                                                   'rev_mileage_multiplier': ['mean'],
                                                                                   'rev_bag': ['mean'],
                                                                                   'rev_pref_seats': ['mean'],})
age_cat_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean','rev_bag_non_cat','rev_other_non_cat','rev_upgrade','rev_tsa','rev_mileage_multiplier','rev_bag','rev_pref_seats']
age_cat_multiple = age_cat_multiple.reset_index()
age_cat_multiple
Out[154]:
Spend_Ind age_cat ancillary_spend_ind ttl mean ttl count flight mean ancllry mean rev_bag_non_cat rev_other_non_cat rev_upgrade rev_tsa rev_mileage_multiplier rev_bag rev_pref_seats
0 1 Z 0 177.38 24 177.38 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
1 1 Z 1 126.09 7 92.70 33.39 15.71 0.00 0.00 0.00 0.00 0.00 17.68
2 1 Millennial 0 182.50 187 182.50 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
3 1 Millennial 1 183.66 45 148.44 35.22 20.67 1.67 0.00 0.00 0.00 0.00 12.88
4 1 X 0 233.05 254 233.05 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
5 1 X 1 224.42 57 183.13 41.29 16.42 1.32 2.25 0.62 0.00 0.00 20.69
6 1 Boomers 0 199.14 248 199.14 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
7 1 Boomers 1 210.34 56 164.01 46.33 20.78 1.34 0.76 0.00 0.92 0.00 22.53
8 1 Silent 0 237.84 24 237.84 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
9 1 Silent 1 201.69 10 143.80 57.89 26.00 7.50 0.00 0.00 0.00 0.00 24.39

Total Revenue in Domestic and International Flights Monthly and Weekly¶

In [155]:
#Converting month and weekday to integers 
df_subset_ancll_spend["dep_month"]=df_subset_ancll_spend["dep_month"].astype(int)
df_subset_ancll_spend["day_of_week_dep"]=df_subset_ancll_spend["day_of_week_dep"].astype(int)
#Creating month and week column based on numbers
#month column
import calendar
df_subset_ancll_spend["month"] = df_subset_ancll_spend["dep_month"].apply(lambda x: calendar.month_name[x])
#Day column
days = {2:'Mon',3:'Tues',4:'Weds',5:'Thurs',6:'Fri',7:'Sat',1:'Sun'}
df_subset_ancll_spend['day']  = df_subset_ancll_spend['day_of_week_dep'] .apply(lambda x: days[x])
In [156]:
#Creating new dataframes to sort based on the month and week
df_ms = df_subset_ancll_spend.sort_values('dep_month', ascending = True).reset_index(drop=True)
df_ws = df_subset_ancll_spend.sort_values('day_of_week_dep', ascending = True).reset_index(drop=True)
In [157]:
# Total Revenue
# Draw Plot SEG_INTRVL_TYPE

fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='Total_Revenue', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms, sort=True,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Total Revenue')
sns.lineplot(x='day', y='Total_Revenue',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,  sort=True, palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Total Revenue')
axes[0].set_title('Annual', fontsize=18); 
axes[1].set_title('Weekly', fontsize=18)
plt.show()
In [158]:
# Flight Revenue
# Draw Plot
fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='FLIGHT_REV', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Flight Revenue')
sns.lineplot(x='day', y='FLIGHT_REV',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Flight Revenue')
axes[0].set_title('Annual', fontsize=18); 
axes[1].set_title('Weekly', fontsize=18)
plt.show()
In [159]:
# ancll_rev_ttl
# Draw Plot

fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='ancll_rev_ttl', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Ancillary Revenue')
sns.lineplot(x='day', y='ancll_rev_ttl',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Ancillary Revenue')
axes[0].set_title('Annual', fontsize=18); 
axes[1].set_title('Weekly', fontsize=18)
plt.show()
In [160]:
# rev_upgrade
# Draw Plot

fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='rev_upgrade', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Ancillary Upgrade Revenue')
sns.lineplot(x='day', y='rev_upgrade',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Ancillary Upgrade Revenue')
axes[0].set_title('Annual', fontsize=18); 
axes[1].set_title('Weekly', fontsize=18)
plt.show()
In [161]:
# rev_tsa
# Draw Plot
fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='rev_tsa', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Ancillary TSA Revenue')
sns.lineplot(x='day', y='rev_tsa',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Ancillary TSA Revenue')
axes[0].set_title('Annual', fontsize=18); 
axes[1].set_title('Weekly', fontsize=18)
plt.show()
In [162]:
# rev_mileage_multiplier
# Draw Plot
fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='rev_mileage_multiplier', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Ancillary MM Revenue')
sns.lineplot(x='day', y='rev_mileage_multiplier',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Ancillary MM Revenue')
axes[0].set_title('Annual', fontsize=18); 
axes[1].set_title('Weekly', fontsize=18)
plt.show()
In [163]:
# rev_bag
# Draw Plot

fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='rev_bag', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Ancillary Bag Oth Revenue')
sns.lineplot(x='day', y='rev_bag',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Ancillary Bag Oth Revenue')
axes[0].set_title('Annual', fontsize=18); 
axes[1].set_title('Weekly', fontsize=18)
plt.show()
In [164]:
# rev_pref_seats
# Draw Plot

fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='rev_pref_seats', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Ancillary Seat Revenue')
sns.lineplot(x='day', y='rev_pref_seats',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Ancillary Seat Revenue')
axes[0].set_title('Annual', fontsize=18); 
axes[1].set_title('Weekly', fontsize=18)
plt.show()
In [165]:
# rev_pref_seats
# Draw Plot

fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='rev_bag_non_cat', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Ancillary Bag Revenue')
sns.lineplot(x='day', y='rev_bag_non_cat',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Ancillary Bag Revenue')
axes[0].set_title('Annual', fontsize=18); 
axes[1].set_title('Weekly', fontsize=18)
plt.show()

Cabin Upgared and Total Revenue¶

In [166]:
#Cabin upgrade on flighs with total revenue
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
sns.barplot(x=df_subset_ancll_spend['SEG_INTRVL_TYPE'], y=df_subset_ancll_spend.Total_Revenue, hue=df_subset_ancll_spend.upgrade_ind,  ci=None, palette='coolwarm').set(title='Total Revenue on Domestic and International Flight by Cabin Upgrade',  ylabel='Total Revenue')
Out[166]:
[Text(0.5, 1.0, 'Total Revenue on Domestic and International Flight by Cabin Upgrade'),
 Text(0, 0.5, 'Total Revenue')]
In [167]:
#Cabin upgrade on flighs with flight revenue

sns.barplot(x=df_subset_ancll_spend['SEG_INTRVL_TYPE'], y=df_subset_ancll_spend.FLIGHT_REV, hue=df_subset_ancll_spend.upgrade_ind,  ci=None, palette='coolwarm').set(title='Total Revenue on Domestic and International Flight by Cabin Upgrade',  ylabel='Flight Revenue')
Out[167]:
[Text(0.5, 1.0, 'Total Revenue on Domestic and International Flight by Cabin Upgrade'),
 Text(0, 0.5, 'Flight Revenue')]
In [168]:
##Cabin upgrade on flighs with total ancillary revenue

sns.barplot(x=df_subset_ancll_spend['SEG_INTRVL_TYPE'], y=df_subset_ancll_spend.ancll_rev_ttl, hue=df_subset_ancll_spend.upgrade_ind,  ci=None, palette='coolwarm').set(title='Total Revenue on Domestic and International Flight by Cabin Upgrade',  ylabel='Ancillary Revenue')
Out[168]:
[Text(0.5, 1.0, 'Total Revenue on Domestic and International Flight by Cabin Upgrade'),
 Text(0, 0.5, 'Ancillary Revenue')]
In [169]:
#Frequency of flight and upgrade information - Destination City
sns.set(rc={'figure.figsize':(10,5)})
ax = sns.countplot(x="city_arr", hue="upgrade_ind" ,data=df_subset_ancll_spend , palette='coolwarm', order=df_subset_ancll_spend.city_arr.value_counts().iloc[:7].index).set(title='Frequency of Flights from US Cities with Upgraded Cabin Information', xlabel='Destination City', ylabel='Frequency')
In [170]:
#Frequency of flight and upgrade information - Departure City
sns.set(rc={'figure.figsize':(10,5)})
ax = sns.countplot(x="city_dep", hue="upgrade_ind" ,data=df_subset_ancll_spend , palette='coolwarm', order=df_subset_ancll_spend.city_dep.value_counts().iloc[:7].index).set(title='Frequency of Flights from US Cities with Upgraded Cabin Information', xlabel='Departure City', ylabel='Frequency')

Frequency of the Flights¶

In [171]:
#Frequency of FLights to US Cities
sns.set(rc={'figure.figsize':(12,5)})
ax = sns.countplot(x='city_arr', data=US_df,palette='coolwarm',order=US_df.city_arr.value_counts().iloc[:7].index).set(title='Frequency of FLights to US Cities', xlabel='Destination City', ylabel='Frequency')
In [172]:
# Departure City upgrade and total revenue 
sns.set(rc={'figure.figsize':(12,5)})
ax = sns.countplot(x='city_dep', data=US_df, palette='coolwarm',order=US_df.city_arr.value_counts().iloc[:7].index).set(title='Frequency of FLights from US Cities', xlabel='Departure City', ylabel='Frequency')

Revenue and Airports¶

In [173]:
#International Total Revenue - Destination City
ax=sns.set(rc={'figure.figsize':(10,8)})
sns.set_theme(style="whitegrid")
ax = sns.barplot(x="city_arr", y='Total_Revenue',hue='ancillary_spend_ind', data=overseas_df ,ci=None, palette='coolwarm', order=overseas_df.city_arr.value_counts().iloc[:7].index).set(title='International Total Revenue - Destination City', xlabel='Destination City', ylabel='Total Revenue')
In [174]:
#International Total Revenue - Departure City
ax=sns.set(rc={'figure.figsize':(10,8)})
sns.set_theme(style="whitegrid")
ax = sns.barplot(x="city_dep", y='Total_Revenue',hue='ancillary_spend_ind', data=overseas_df ,ci=None, palette='coolwarm', order=overseas_df.city_dep.value_counts().iloc[:7].index).set(title='International Total Revenue - Departure City', xlabel='Departure City', ylabel='Total Revenue')
In [175]:
#Highest Total Revenue - Departure City
sns.set_theme(style="whitegrid")
ax = sns.barplot(x="city_dep", y='Total_Revenue', hue='ancillary_spend_ind',data=US_df,ci=None,palette='coolwarm', order=US_df.city_arr.value_counts().iloc[:7].index).set(title='Highest Total Revenue - Departure City', xlabel='Departure City', ylabel='Total Revenue')
In [176]:
#Highest Total Revenue - Destination City
ax=sns.set(rc={'figure.figsize':(10,8)})
sns.set_theme(style="whitegrid")
ax = sns.barplot(x="city_arr", y='Total_Revenue',hue='ancillary_spend_ind', data=US_df ,ci=None, palette='coolwarm', order=US_df.city_arr.value_counts().iloc[:7].index).set(title='Highest Total Revenue - Destination City', xlabel='Destination City', ylabel='Total Revenue')
In [177]:
#Highest Total Revenue with Loyalty Level Information - Destination City
sns.set(rc={'figure.figsize':(12,5)})
ax = sns.barplot(x="city_arr", y='Total_Revenue', hue="LYLTY_LEVEL_CD", palette='coolwarm', data=US_df,ci=None, order=US_df.city_arr.value_counts().iloc[:7].index).set(title='Highest Total Revenue with Loyalty Level Information - Destination City', xlabel='Destination City', ylabel='Total Revenue')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
Out[177]:
<matplotlib.legend.Legend at 0x7f17dbe71438>
In [178]:
#Highest Total Revenue with Loyalty Level Information - Departure City
sns.set(rc={'figure.figsize':(12,5)})
ax = sns.barplot(x="city_dep", y='Total_Revenue', hue="LYLTY_LEVEL_CD", palette='coolwarm', data=US_df,ci=None,order=US_df.city_arr.value_counts().iloc[:7].index).set(title='Highest Total Revenue with Loyalty Level Information - Departure City', xlabel='Departure City', ylabel='Total Revenue')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
Out[178]:
<matplotlib.legend.Legend at 0x7f17db01dcc0>
In [179]:
# Highest Total Revenue from US to Domestic and International Flights
sns.set(rc={'figure.figsize':(12,5)})
ax = sns.barplot(x="city_dep", y='Total_Revenue', hue="SEG_INTRVL_TYPE", ci=None,palette='coolwarm', data=df_subset_spend, order=df.city_arr.value_counts().iloc[:7].index).set(title='Highest Total Revenue from US to Domestic and International Flights', xlabel='Departure City', ylabel='Total Revenue')
In [180]:
#Highest Total Revenue from International Flights
sns.set(rc={'figure.figsize':(12,5)})
ax = sns.barplot(x="city_arr", y='Total_Revenue', hue='ancillary_spend_ind', ci=None,palette='coolwarm', data=overseas_df,order=pd.value_counts(overseas_df['city_arr']).iloc[:7].index).set(title='Highest Total Revenue from International Flights', xlabel='Destination City', ylabel='Total Revenue')
In [181]:
#Highest Total Revenue from International Flights
sns.set(rc={'figure.figsize':(12,5)})
ax = sns.barplot(x="city_dep", y='Total_Revenue', hue='ancillary_spend_ind', ci=None,palette='coolwarm', data=overseas_df,order=pd.value_counts(overseas_df['city_dep']).iloc[:7].index).set(title='Highest Total Revenue from International Flights', xlabel='Departure City', ylabel='Total Revenue')

Geo Analysis¶

In [182]:
# Create frequency of flight
df_join.groupby(['DEP_CNTRY_CD',
                 'ARVL_CNTRY_CD',
                 'city_dep',
                 'city_arr',
                 'location_lat_dep',
                 'location_long_dep',
                 'location_lat_arr',
                 'location_long_arr'
                 ]).size().reset_index(name="number_of_flight").to_csv('flight Frequency.csv')
flight_freq= pd.read_csv('flight frequency.csv')
#International Flights
overseas_df = flight_freq[(flight_freq["DEP_CNTRY_CD"] != flight_freq["ARVL_CNTRY_CD"]) & (flight_freq["ARVL_CNTRY_CD"] != "US")]
#print("International Travel Dataset Size After Filtering : ", overseas_df.shape)
#Domestic Flights
US_df = flight_freq[flight_freq["DEP_CNTRY_CD"] == flight_freq["ARVL_CNTRY_CD"]]
#print("International Travel Dataset Size After Filtering : ", US_df.shape)
In [183]:
#Reading World map 
with plt.style.context(("seaborn", "ggplot")):
    world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
world.head()
Out[183]:
pop_est continent name iso_a3 gdp_md_est geometry
0 920938 Oceania Fiji FJI 8374.00 MULTIPOLYGON (((180.00000 -16.06713, 180.00000...
1 53950935 Africa Tanzania TZA 150600.00 POLYGON ((33.90371 -0.95000, 34.07262 -1.05982...
2 603253 Africa W. Sahara ESH 906.50 POLYGON ((-8.66559 27.65643, -8.66512 27.58948...
3 35623680 North America Canada CAN 1674000.00 MULTIPOLYGON (((-122.84000 49.00000, -122.9742...
4 326625791 North America United States of America USA 18560000.00 MULTIPOLYGON (((-122.84000 49.00000, -120.0000...
In [184]:
with plt.style.context(("seaborn", "ggplot")):
    ## Plot world
    world.plot(figsize=(60,10), edgecolor="blue", color="white");

    ## Loop through each flight plotting line depicting flight between source and destination
    ## We are also plotting scatter points depicting source and destinations
    ## Aprt from that we also have added logic for labels to destination cities.
    for slat,dlat, slon, dlon, num_flights, src_city, dest_city in zip(overseas_df["location_lat_dep"], overseas_df["location_lat_arr"], overseas_df["location_long_dep"], overseas_df["location_long_arr"], overseas_df["number_of_flight"], overseas_df["city_dep"], overseas_df["city_arr"]):
        plt.plot([slon , dlon], [slat, dlat], linewidth=num_flights/20, color="red", alpha=0.5)
        plt.scatter( [slon, dlon], [slat, dlat], color="blue", alpha=1, s=num_flights/2)

        #plt.text(slon+5, slat+5, src_city, fontsize=9, color="blue",alpha=1.0, horizontalalignment='center', verticalalignment='center')
        #plt.text(dlon+5, dlat+5, dest_city, fontsize=9, color="black", alpha=1, horizontalalignment='center', verticalalignment='center')

    plt.title("Connection Map Depicting Flights from US to All Other Countries")

    #plt.savefig("connection-map-geopandas-3.png", dpi=100)
In [187]:
## Plot world
with plt.style.context(("seaborn", "ggplot")):
    world[world.name == "United States of America"].plot(figsize=(15,15), edgecolor="blue", color="white");

    ## Loop through each flight plotting line depicting flight between source and destination
    ## We are also plotting scatter points depicting source and destinations.
    ## Aprt from that we also have added logic for labels of source and destination cities.
    for slat,dlat, slon, dlon, num_flights, src_city, dest_city in zip(US_df["location_lat_dep"], US_df["location_lat_arr"], US_df["location_long_dep"], US_df["location_long_arr"], US_df["number_of_flight"], US_df["city_dep"], US_df["city_arr"]):
        plt.plot([slon , dlon], [slat, dlat], linewidth=num_flights/300, color="lime", alpha=1)
        plt.scatter( [slon, dlon], [slat, dlat], color="orangered", alpha=1, s=num_flights/10)

        #plt.text(slon+0.5, slat+0.5, src_city, fontsize=9, color="dodgerblue", alpha=0.1, horizontalalignment='center', verticalalignment='center')
        #plt.text(dlon+0.5, dlat+0.5, dest_city, fontsize=9, color="dodgerblue", alpha=0.1, horizontalalignment='center', verticalalignment='center')

    plt.title("Connection Map Depicting Flights between Cities in United States")

    #plt.savefig("connection-map-geopandas-5.png", dpi=100)